Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                  |    5
-rw-r--r--  drivers/md/bitmap.c                 |  137
-rw-r--r--  drivers/md/bitmap.h                 |    5
-rw-r--r--  drivers/md/dm-crypt.c               |  647
-rw-r--r--  drivers/md/dm-flakey.c              |  272
-rw-r--r--  drivers/md/dm-io.c                  |   29
-rw-r--r--  drivers/md/dm-ioctl.c               |   89
-rw-r--r--  drivers/md/dm-kcopyd.c              |   45
-rw-r--r--  drivers/md/dm-log-userspace-base.c  |    3
-rw-r--r--  drivers/md/dm-log.c                 |   32
-rw-r--r--  drivers/md/dm-mpath.c               |  149
-rw-r--r--  drivers/md/dm-queue-length.c        |    2
-rw-r--r--  drivers/md/dm-raid.c                |  621
-rw-r--r--  drivers/md/dm-snap-persistent.c     |   80
-rw-r--r--  drivers/md/dm-snap.c                |   84
-rw-r--r--  drivers/md/dm-table.c               |  187
-rw-r--r--  drivers/md/dm.c                     |   75
-rw-r--r--  drivers/md/dm.h                     |    2
-rw-r--r--  drivers/md/linear.c                 |    8
-rw-r--r--  drivers/md/linear.h                 |    2
-rw-r--r--  drivers/md/md.c                     |  945
-rw-r--r--  drivers/md/md.h                     |  112
-rw-r--r--  drivers/md/multipath.c              |    3
-rw-r--r--  drivers/md/raid1.c                  |  980
-rw-r--r--  drivers/md/raid1.h                  |   26
-rw-r--r--  drivers/md/raid10.c                 | 1209
-rw-r--r--  drivers/md/raid10.h                 |   21
-rw-r--r--  drivers/md/raid5.c                  | 1025
-rw-r--r--  drivers/md/raid5.h                  |   99
29 files changed, 4621 insertions, 2273 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5e..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR | |||
241 | needed for live data migration tools such as 'pvmove'. | 241 | needed for live data migration tools such as 'pvmove'. |
242 | 242 | ||
243 | config DM_RAID | 243 | config DM_RAID |
244 | tristate "RAID 4/5/6 target (EXPERIMENTAL)" | 244 | tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" |
245 | depends on BLK_DEV_DM && EXPERIMENTAL | 245 | depends on BLK_DEV_DM && EXPERIMENTAL |
246 | select MD_RAID1 | ||
246 | select MD_RAID456 | 247 | select MD_RAID456 |
247 | select BLK_DEV_MD | 248 | select BLK_DEV_MD |
248 | ---help--- | 249 | ---help--- |
249 | A dm target that supports RAID4, RAID5 and RAID6 mappings | 250 | A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings |
250 | 251 | ||
251 | A RAID-5 set of N drives with a capacity of C MB per drive provides | 252 | A RAID-5 set of N drives with a capacity of C MB per drive provides |
252 | the capacity of C * (N - 1) MB, and protects against a failure | 253 | the capacity of C * (N - 1) MB, and protects against a failure |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd..0dc6546b77a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@ | |||
29 | #include "md.h" | 29 | #include "md.h" |
30 | #include "bitmap.h" | 30 | #include "bitmap.h" |
31 | 31 | ||
32 | #include <linux/dm-dirty-log.h> | ||
33 | /* debug macros */ | 32 | /* debug macros */ |
34 | 33 | ||
35 | #define DEBUG 0 | 34 | #define DEBUG 0 |
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon | |||
775 | * 0 or page 1 | 774 | * 0 or page 1 |
776 | */ | 775 | */ |
777 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 776 | static inline struct page *filemap_get_page(struct bitmap *bitmap, |
778 | unsigned long chunk) | 777 | unsigned long chunk) |
779 | { | 778 | { |
780 | if (bitmap->filemap == NULL) | ||
781 | return NULL; | ||
782 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) | 779 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) |
783 | return NULL; | 780 | return NULL; |
784 | return bitmap->filemap[file_page_index(bitmap, chunk) | 781 | return bitmap->filemap[file_page_index(bitmap, chunk) |
@@ -878,28 +875,19 @@ enum bitmap_page_attr { | |||
878 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, | 875 | static inline void set_page_attr(struct bitmap *bitmap, struct page *page, |
879 | enum bitmap_page_attr attr) | 876 | enum bitmap_page_attr attr) |
880 | { | 877 | { |
881 | if (page) | 878 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); |
882 | __set_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
883 | else | ||
884 | __set_bit(attr, &bitmap->logattrs); | ||
885 | } | 879 | } |
886 | 880 | ||
887 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, | 881 | static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, |
888 | enum bitmap_page_attr attr) | 882 | enum bitmap_page_attr attr) |
889 | { | 883 | { |
890 | if (page) | 884 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); |
891 | __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
892 | else | ||
893 | __clear_bit(attr, &bitmap->logattrs); | ||
894 | } | 885 | } |
895 | 886 | ||
896 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, | 887 | static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, |
897 | enum bitmap_page_attr attr) | 888 | enum bitmap_page_attr attr) |
898 | { | 889 | { |
899 | if (page) | 890 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); |
900 | return test_bit((page->index<<2) + attr, bitmap->filemap_attr); | ||
901 | else | ||
902 | return test_bit(attr, &bitmap->logattrs); | ||
903 | } | 891 | } |
904 | 892 | ||
905 | /* | 893 | /* |
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p | |||
912 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | 900 | static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) |
913 | { | 901 | { |
914 | unsigned long bit; | 902 | unsigned long bit; |
915 | struct page *page = NULL; | 903 | struct page *page; |
916 | void *kaddr; | 904 | void *kaddr; |
917 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); | 905 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); |
918 | 906 | ||
919 | if (!bitmap->filemap) { | 907 | if (!bitmap->filemap) |
920 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | 908 | return; |
921 | if (log) | ||
922 | log->type->mark_region(log, chunk); | ||
923 | } else { | ||
924 | 909 | ||
925 | page = filemap_get_page(bitmap, chunk); | 910 | page = filemap_get_page(bitmap, chunk); |
926 | if (!page) | 911 | if (!page) |
927 | return; | 912 | return; |
928 | bit = file_page_offset(bitmap, chunk); | 913 | bit = file_page_offset(bitmap, chunk); |
929 | 914 | ||
930 | /* set the bit */ | 915 | /* set the bit */ |
931 | kaddr = kmap_atomic(page, KM_USER0); | 916 | kaddr = kmap_atomic(page, KM_USER0); |
932 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 917 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
933 | set_bit(bit, kaddr); | 918 | set_bit(bit, kaddr); |
934 | else | 919 | else |
935 | __test_and_set_bit_le(bit, kaddr); | 920 | __set_bit_le(bit, kaddr); |
936 | kunmap_atomic(kaddr, KM_USER0); | 921 | kunmap_atomic(kaddr, KM_USER0); |
937 | PRINTK("set file bit %lu page %lu\n", bit, page->index); | 922 | PRINTK("set file bit %lu page %lu\n", bit, page->index); |
938 | } | ||
939 | /* record page number so it gets flushed to disk when unplug occurs */ | 923 | /* record page number so it gets flushed to disk when unplug occurs */ |
940 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 924 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); |
941 | } | 925 | } |
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
952 | 936 | ||
953 | if (!bitmap) | 937 | if (!bitmap) |
954 | return; | 938 | return; |
955 | if (!bitmap->filemap) { | ||
956 | /* Must be using a dirty_log */ | ||
957 | struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log; | ||
958 | dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs); | ||
959 | need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs); | ||
960 | if (dirty || need_write) | ||
961 | if (log->type->flush(log)) | ||
962 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
963 | goto out; | ||
964 | } | ||
965 | 939 | ||
966 | /* look at each page to see if there are any set bits that need to be | 940 | /* look at each page to see if there are any set bits that need to be |
967 | * flushed out to disk */ | 941 | * flushed out to disk */ |
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
990 | else | 964 | else |
991 | md_super_wait(bitmap->mddev); | 965 | md_super_wait(bitmap->mddev); |
992 | } | 966 | } |
993 | out: | ||
994 | if (bitmap->flags & BITMAP_WRITE_ERROR) | 967 | if (bitmap->flags & BITMAP_WRITE_ERROR) |
995 | bitmap_file_kick(bitmap); | 968 | bitmap_file_kick(bitmap); |
996 | } | 969 | } |
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1199 | struct page *page = NULL, *lastpage = NULL; | 1172 | struct page *page = NULL, *lastpage = NULL; |
1200 | sector_t blocks; | 1173 | sector_t blocks; |
1201 | void *paddr; | 1174 | void *paddr; |
1202 | struct dm_dirty_log *log = mddev->bitmap_info.log; | ||
1203 | 1175 | ||
1204 | /* Use a mutex to guard daemon_work against | 1176 | /* Use a mutex to guard daemon_work against |
1205 | * bitmap_destroy. | 1177 | * bitmap_destroy. |
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1224 | spin_lock_irqsave(&bitmap->lock, flags); | 1196 | spin_lock_irqsave(&bitmap->lock, flags); |
1225 | for (j = 0; j < bitmap->chunks; j++) { | 1197 | for (j = 0; j < bitmap->chunks; j++) { |
1226 | bitmap_counter_t *bmc; | 1198 | bitmap_counter_t *bmc; |
1227 | if (!bitmap->filemap) { | 1199 | if (!bitmap->filemap) |
1228 | if (!log) | 1200 | /* error or shutdown */ |
1229 | /* error or shutdown */ | 1201 | break; |
1230 | break; | 1202 | |
1231 | } else | 1203 | page = filemap_get_page(bitmap, j); |
1232 | page = filemap_get_page(bitmap, j); | ||
1233 | 1204 | ||
1234 | if (page != lastpage) { | 1205 | if (page != lastpage) { |
1235 | /* skip this page unless it's marked as needing cleaning */ | 1206 | /* skip this page unless it's marked as needing cleaning */ |
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1298 | -1); | 1269 | -1); |
1299 | 1270 | ||
1300 | /* clear the bit */ | 1271 | /* clear the bit */ |
1301 | if (page) { | 1272 | paddr = kmap_atomic(page, KM_USER0); |
1302 | paddr = kmap_atomic(page, KM_USER0); | 1273 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1303 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1274 | clear_bit(file_page_offset(bitmap, j), |
1304 | clear_bit(file_page_offset(bitmap, j), | 1275 | paddr); |
1305 | paddr); | 1276 | else |
1306 | else | 1277 | __clear_bit_le( |
1307 | __test_and_clear_bit_le(file_page_offset(bitmap, j), | 1278 | file_page_offset(bitmap, |
1308 | paddr); | 1279 | j), |
1309 | kunmap_atomic(paddr, KM_USER0); | 1280 | paddr); |
1310 | } else | 1281 | kunmap_atomic(paddr, KM_USER0); |
1311 | log->type->clear_region(log, j); | ||
1312 | } | 1282 | } |
1313 | } else | 1283 | } else |
1314 | j |= PAGE_COUNTER_MASK; | 1284 | j |= PAGE_COUNTER_MASK; |
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1316 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1286 | spin_unlock_irqrestore(&bitmap->lock, flags); |
1317 | 1287 | ||
1318 | /* now sync the final page */ | 1288 | /* now sync the final page */ |
1319 | if (lastpage != NULL || log != NULL) { | 1289 | if (lastpage != NULL) { |
1320 | spin_lock_irqsave(&bitmap->lock, flags); | 1290 | spin_lock_irqsave(&bitmap->lock, flags); |
1321 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { | 1291 | if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { |
1322 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1292 | clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
1323 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1293 | spin_unlock_irqrestore(&bitmap->lock, flags); |
1324 | if (lastpage) | 1294 | write_page(bitmap, lastpage, 0); |
1325 | write_page(bitmap, lastpage, 0); | ||
1326 | else | ||
1327 | if (log->type->flush(log)) | ||
1328 | bitmap->flags |= BITMAP_WRITE_ERROR; | ||
1329 | } else { | 1295 | } else { |
1330 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); | 1296 | set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); |
1331 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1297 | spin_unlock_irqrestore(&bitmap->lock, flags); |
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev) | |||
1767 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1733 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1768 | 1734 | ||
1769 | if (!file | 1735 | if (!file |
1770 | && !mddev->bitmap_info.offset | 1736 | && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ |
1771 | && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */ | ||
1772 | return 0; | 1737 | return 0; |
1773 | 1738 | ||
1774 | BUG_ON(file && mddev->bitmap_info.offset); | 1739 | BUG_ON(file && mddev->bitmap_info.offset); |
1775 | BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log); | ||
1776 | 1740 | ||
1777 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1741 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1778 | if (!bitmap) | 1742 | if (!bitmap) |
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev) | |||
1863 | int bitmap_load(mddev_t *mddev) | 1827 | int bitmap_load(mddev_t *mddev) |
1864 | { | 1828 | { |
1865 | int err = 0; | 1829 | int err = 0; |
1830 | sector_t start = 0; | ||
1866 | sector_t sector = 0; | 1831 | sector_t sector = 0; |
1867 | struct bitmap *bitmap = mddev->bitmap; | 1832 | struct bitmap *bitmap = mddev->bitmap; |
1868 | 1833 | ||
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev) | |||
1881 | } | 1846 | } |
1882 | bitmap_close_sync(bitmap); | 1847 | bitmap_close_sync(bitmap); |
1883 | 1848 | ||
1884 | if (mddev->bitmap_info.log) { | 1849 | if (mddev->degraded == 0 |
1885 | unsigned long i; | 1850 | || bitmap->events_cleared == mddev->events) |
1886 | struct dm_dirty_log *log = mddev->bitmap_info.log; | 1851 | /* no need to keep dirty bits to optimise a |
1887 | for (i = 0; i < bitmap->chunks; i++) | 1852 | * re-add of a missing device */ |
1888 | if (!log->type->in_sync(log, i, 1)) | 1853 | start = mddev->recovery_cp; |
1889 | bitmap_set_memory_bits(bitmap, | 1854 | |
1890 | (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), | 1855 | err = bitmap_init_from_disk(bitmap, start); |
1891 | 1); | 1856 | |
1892 | } else { | ||
1893 | sector_t start = 0; | ||
1894 | if (mddev->degraded == 0 | ||
1895 | || bitmap->events_cleared == mddev->events) | ||
1896 | /* no need to keep dirty bits to optimise a | ||
1897 | * re-add of a missing device */ | ||
1898 | start = mddev->recovery_cp; | ||
1899 | |||
1900 | err = bitmap_init_from_disk(bitmap, start); | ||
1901 | } | ||
1902 | if (err) | 1857 | if (err) |
1903 | goto out; | 1858 | goto out; |
1904 | 1859 | ||
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891a..a28f2e5588c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap { | |||
212 | unsigned long file_pages; /* number of pages in the file */ | 212 | unsigned long file_pages; /* number of pages in the file */ |
213 | int last_page_size; /* bytes in the last page */ | 213 | int last_page_size; /* bytes in the last page */ |
214 | 214 | ||
215 | unsigned long logattrs; /* used when filemap_attr doesn't exist | ||
216 | * because we are working with a dirty_log | ||
217 | */ | ||
218 | |||
219 | unsigned long flags; | 215 | unsigned long flags; |
220 | 216 | ||
221 | int allclean; | 217 | int allclean; |
@@ -237,7 +233,6 @@ struct bitmap { | |||
237 | wait_queue_head_t behind_wait; | 233 | wait_queue_head_t behind_wait; |
238 | 234 | ||
239 | struct sysfs_dirent *sysfs_can_clear; | 235 | struct sysfs_dirent *sysfs_can_clear; |
240 | |||
241 | }; | 236 | }; |
242 | 237 | ||
243 | /* the bitmap API */ | 238 | /* the bitmap API */ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c8827ffd85b..1f1d3423d39 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,19 +18,14 @@ | |||
18 | #include <linux/crypto.h> | 18 | #include <linux/crypto.h> |
19 | #include <linux/workqueue.h> | 19 | #include <linux/workqueue.h> |
20 | #include <linux/backing-dev.h> | 20 | #include <linux/backing-dev.h> |
21 | #include <linux/percpu.h> | ||
22 | #include <asm/atomic.h> | 21 | #include <asm/atomic.h> |
23 | #include <linux/scatterlist.h> | 22 | #include <linux/scatterlist.h> |
24 | #include <asm/page.h> | 23 | #include <asm/page.h> |
25 | #include <asm/unaligned.h> | 24 | #include <asm/unaligned.h> |
26 | #include <crypto/hash.h> | ||
27 | #include <crypto/md5.h> | ||
28 | #include <crypto/algapi.h> | ||
29 | 25 | ||
30 | #include <linux/device-mapper.h> | 26 | #include <linux/device-mapper.h> |
31 | 27 | ||
32 | #define DM_MSG_PREFIX "crypt" | 28 | #define DM_MSG_PREFIX "crypt" |
33 | #define MESG_STR(x) x, sizeof(x) | ||
34 | 29 | ||
35 | /* | 30 | /* |
36 | * context holding the current state of a multi-part conversion | 31 | * context holding the current state of a multi-part conversion |
@@ -67,7 +62,6 @@ struct dm_crypt_request { | |||
67 | struct convert_context *ctx; | 62 | struct convert_context *ctx; |
68 | struct scatterlist sg_in; | 63 | struct scatterlist sg_in; |
69 | struct scatterlist sg_out; | 64 | struct scatterlist sg_out; |
70 | sector_t iv_sector; | ||
71 | }; | 65 | }; |
72 | 66 | ||
73 | struct crypt_config; | 67 | struct crypt_config; |
@@ -78,13 +72,11 @@ struct crypt_iv_operations { | |||
78 | void (*dtr)(struct crypt_config *cc); | 72 | void (*dtr)(struct crypt_config *cc); |
79 | int (*init)(struct crypt_config *cc); | 73 | int (*init)(struct crypt_config *cc); |
80 | int (*wipe)(struct crypt_config *cc); | 74 | int (*wipe)(struct crypt_config *cc); |
81 | int (*generator)(struct crypt_config *cc, u8 *iv, | 75 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); |
82 | struct dm_crypt_request *dmreq); | ||
83 | int (*post)(struct crypt_config *cc, u8 *iv, | ||
84 | struct dm_crypt_request *dmreq); | ||
85 | }; | 76 | }; |
86 | 77 | ||
87 | struct iv_essiv_private { | 78 | struct iv_essiv_private { |
79 | struct crypto_cipher *tfm; | ||
88 | struct crypto_hash *hash_tfm; | 80 | struct crypto_hash *hash_tfm; |
89 | u8 *salt; | 81 | u8 *salt; |
90 | }; | 82 | }; |
@@ -93,32 +85,11 @@ struct iv_benbi_private { | |||
93 | int shift; | 85 | int shift; |
94 | }; | 86 | }; |
95 | 87 | ||
96 | #define LMK_SEED_SIZE 64 /* hash + 0 */ | ||
97 | struct iv_lmk_private { | ||
98 | struct crypto_shash *hash_tfm; | ||
99 | u8 *seed; | ||
100 | }; | ||
101 | |||
102 | /* | 88 | /* |
103 | * Crypt: maps a linear range of a block device | 89 | * Crypt: maps a linear range of a block device |
104 | * and encrypts / decrypts at the same time. | 90 | * and encrypts / decrypts at the same time. |
105 | */ | 91 | */ |
106 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; | 92 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; |
107 | |||
108 | /* | ||
109 | * Duplicated per-CPU state for cipher. | ||
110 | */ | ||
111 | struct crypt_cpu { | ||
112 | struct ablkcipher_request *req; | ||
113 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
114 | void *iv_private; | ||
115 | struct crypto_ablkcipher *tfms[0]; | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * The fields in here must be read only after initialization, | ||
120 | * changing state should be in crypt_cpu. | ||
121 | */ | ||
122 | struct crypt_config { | 93 | struct crypt_config { |
123 | struct dm_dev *dev; | 94 | struct dm_dev *dev; |
124 | sector_t start; | 95 | sector_t start; |
@@ -142,19 +113,11 @@ struct crypt_config { | |||
142 | union { | 113 | union { |
143 | struct iv_essiv_private essiv; | 114 | struct iv_essiv_private essiv; |
144 | struct iv_benbi_private benbi; | 115 | struct iv_benbi_private benbi; |
145 | struct iv_lmk_private lmk; | ||
146 | } iv_gen_private; | 116 | } iv_gen_private; |
147 | sector_t iv_offset; | 117 | sector_t iv_offset; |
148 | unsigned int iv_size; | 118 | unsigned int iv_size; |
149 | 119 | ||
150 | /* | 120 | /* |
151 | * Duplicated per cpu state. Access through | ||
152 | * per_cpu_ptr() only. | ||
153 | */ | ||
154 | struct crypt_cpu __percpu *cpu; | ||
155 | unsigned tfms_count; | ||
156 | |||
157 | /* | ||
158 | * Layout of each crypto request: | 121 | * Layout of each crypto request: |
159 | * | 122 | * |
160 | * struct ablkcipher_request | 123 | * struct ablkcipher_request |
@@ -168,10 +131,11 @@ struct crypt_config { | |||
168 | * correctly aligned. | 131 | * correctly aligned. |
169 | */ | 132 | */ |
170 | unsigned int dmreq_start; | 133 | unsigned int dmreq_start; |
134 | struct ablkcipher_request *req; | ||
171 | 135 | ||
136 | struct crypto_ablkcipher *tfm; | ||
172 | unsigned long flags; | 137 | unsigned long flags; |
173 | unsigned int key_size; | 138 | unsigned int key_size; |
174 | unsigned int key_parts; | ||
175 | u8 key[0]; | 139 | u8 key[0]; |
176 | }; | 140 | }; |
177 | 141 | ||
@@ -183,20 +147,6 @@ static struct kmem_cache *_crypt_io_pool; | |||
183 | 147 | ||
184 | static void clone_init(struct dm_crypt_io *, struct bio *); | 148 | static void clone_init(struct dm_crypt_io *, struct bio *); |
185 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); | 149 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); |
186 | static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); | ||
187 | |||
188 | static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) | ||
189 | { | ||
190 | return this_cpu_ptr(cc->cpu); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Use this to access cipher attributes that are the same for each CPU. | ||
195 | */ | ||
196 | static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | ||
197 | { | ||
198 | return __this_cpu_ptr(cc->cpu)->tfms[0]; | ||
199 | } | ||
200 | 150 | ||
201 | /* | 151 | /* |
202 | * Different IV generation algorithms: | 152 | * Different IV generation algorithms: |
@@ -217,38 +167,23 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | |||
217 | * null: the initial vector is always zero. Provides compatibility with | 167 | * null: the initial vector is always zero. Provides compatibility with |
218 | * obsolete loop_fish2 devices. Do not use for new devices. | 168 | * obsolete loop_fish2 devices. Do not use for new devices. |
219 | * | 169 | * |
220 | * lmk: Compatible implementation of the block chaining mode used | ||
221 | * by the Loop-AES block device encryption system | ||
222 | * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ | ||
223 | * It operates on full 512 byte sectors and uses CBC | ||
224 | * with an IV derived from the sector number, the data and | ||
225 | * optionally extra IV seed. | ||
226 | * This means that after decryption the first block | ||
227 | * of sector must be tweaked according to decrypted data. | ||
228 | * Loop-AES can use three encryption schemes: | ||
229 | * version 1: is plain aes-cbc mode | ||
230 | * version 2: uses 64 multikey scheme with lmk IV generator | ||
231 | * version 3: the same as version 2 with additional IV seed | ||
232 | * (it uses 65 keys, last key is used as IV seed) | ||
233 | * | ||
234 | * plumb: unimplemented, see: | 170 | * plumb: unimplemented, see: |
235 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | 171 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 |
236 | */ | 172 | */ |
237 | 173 | ||
238 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, | 174 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
239 | struct dm_crypt_request *dmreq) | ||
240 | { | 175 | { |
241 | memset(iv, 0, cc->iv_size); | 176 | memset(iv, 0, cc->iv_size); |
242 | *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); | 177 | *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); |
243 | 178 | ||
244 | return 0; | 179 | return 0; |
245 | } | 180 | } |
246 | 181 | ||
247 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, | 182 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
248 | struct dm_crypt_request *dmreq) | 183 | sector_t sector) |
249 | { | 184 | { |
250 | memset(iv, 0, cc->iv_size); | 185 | memset(iv, 0, cc->iv_size); |
251 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); | 186 | *(u64 *)iv = cpu_to_le64(sector); |
252 | 187 | ||
253 | return 0; | 188 | return 0; |
254 | } | 189 | } |
@@ -259,8 +194,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
259 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 194 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
260 | struct hash_desc desc; | 195 | struct hash_desc desc; |
261 | struct scatterlist sg; | 196 | struct scatterlist sg; |
262 | struct crypto_cipher *essiv_tfm; | 197 | int err; |
263 | int err, cpu; | ||
264 | 198 | ||
265 | sg_init_one(&sg, cc->key, cc->key_size); | 199 | sg_init_one(&sg, cc->key, cc->key_size); |
266 | desc.tfm = essiv->hash_tfm; | 200 | desc.tfm = essiv->hash_tfm; |
@@ -270,16 +204,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
270 | if (err) | 204 | if (err) |
271 | return err; | 205 | return err; |
272 | 206 | ||
273 | for_each_possible_cpu(cpu) { | 207 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, |
274 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, | ||
275 | |||
276 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, | ||
277 | crypto_hash_digestsize(essiv->hash_tfm)); | 208 | crypto_hash_digestsize(essiv->hash_tfm)); |
278 | if (err) | ||
279 | return err; | ||
280 | } | ||
281 | |||
282 | return 0; | ||
283 | } | 209 | } |
284 | 210 | ||
285 | /* Wipe salt and reset key derived from volume key */ | 211 | /* Wipe salt and reset key derived from volume key */ |
@@ -287,76 +213,24 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) | |||
287 | { | 213 | { |
288 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 214 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
289 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | 215 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); |
290 | struct crypto_cipher *essiv_tfm; | ||
291 | int cpu, r, err = 0; | ||
292 | 216 | ||
293 | memset(essiv->salt, 0, salt_size); | 217 | memset(essiv->salt, 0, salt_size); |
294 | 218 | ||
295 | for_each_possible_cpu(cpu) { | 219 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); |
296 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; | ||
297 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); | ||
298 | if (r) | ||
299 | err = r; | ||
300 | } | ||
301 | |||
302 | return err; | ||
303 | } | ||
304 | |||
305 | /* Set up per cpu cipher state */ | ||
306 | static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | ||
307 | struct dm_target *ti, | ||
308 | u8 *salt, unsigned saltsize) | ||
309 | { | ||
310 | struct crypto_cipher *essiv_tfm; | ||
311 | int err; | ||
312 | |||
313 | /* Setup the essiv_tfm with the given salt */ | ||
314 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
315 | if (IS_ERR(essiv_tfm)) { | ||
316 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
317 | return essiv_tfm; | ||
318 | } | ||
319 | |||
320 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
321 | crypto_ablkcipher_ivsize(any_tfm(cc))) { | ||
322 | ti->error = "Block size of ESSIV cipher does " | ||
323 | "not match IV size of block cipher"; | ||
324 | crypto_free_cipher(essiv_tfm); | ||
325 | return ERR_PTR(-EINVAL); | ||
326 | } | ||
327 | |||
328 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
329 | if (err) { | ||
330 | ti->error = "Failed to set key for ESSIV cipher"; | ||
331 | crypto_free_cipher(essiv_tfm); | ||
332 | return ERR_PTR(err); | ||
333 | } | ||
334 | |||
335 | return essiv_tfm; | ||
336 | } | 220 | } |
337 | 221 | ||
338 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 222 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) |
339 | { | 223 | { |
340 | int cpu; | ||
341 | struct crypt_cpu *cpu_cc; | ||
342 | struct crypto_cipher *essiv_tfm; | ||
343 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 224 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
344 | 225 | ||
226 | crypto_free_cipher(essiv->tfm); | ||
227 | essiv->tfm = NULL; | ||
228 | |||
345 | crypto_free_hash(essiv->hash_tfm); | 229 | crypto_free_hash(essiv->hash_tfm); |
346 | essiv->hash_tfm = NULL; | 230 | essiv->hash_tfm = NULL; |
347 | 231 | ||
348 | kzfree(essiv->salt); | 232 | kzfree(essiv->salt); |
349 | essiv->salt = NULL; | 233 | essiv->salt = NULL; |
350 | |||
351 | for_each_possible_cpu(cpu) { | ||
352 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
353 | essiv_tfm = cpu_cc->iv_private; | ||
354 | |||
355 | if (essiv_tfm) | ||
356 | crypto_free_cipher(essiv_tfm); | ||
357 | |||
358 | cpu_cc->iv_private = NULL; | ||
359 | } | ||
360 | } | 234 | } |
361 | 235 | ||
362 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 236 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, |
@@ -365,7 +239,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
365 | struct crypto_cipher *essiv_tfm = NULL; | 239 | struct crypto_cipher *essiv_tfm = NULL; |
366 | struct crypto_hash *hash_tfm = NULL; | 240 | struct crypto_hash *hash_tfm = NULL; |
367 | u8 *salt = NULL; | 241 | u8 *salt = NULL; |
368 | int err, cpu; | 242 | int err; |
369 | 243 | ||
370 | if (!opts) { | 244 | if (!opts) { |
371 | ti->error = "Digest algorithm missing for ESSIV mode"; | 245 | ti->error = "Digest algorithm missing for ESSIV mode"; |
@@ -387,44 +261,48 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
387 | goto bad; | 261 | goto bad; |
388 | } | 262 | } |
389 | 263 | ||
264 | /* Allocate essiv_tfm */ | ||
265 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
266 | if (IS_ERR(essiv_tfm)) { | ||
267 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
268 | err = PTR_ERR(essiv_tfm); | ||
269 | goto bad; | ||
270 | } | ||
271 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
272 | crypto_ablkcipher_ivsize(cc->tfm)) { | ||
273 | ti->error = "Block size of ESSIV cipher does " | ||
274 | "not match IV size of block cipher"; | ||
275 | err = -EINVAL; | ||
276 | goto bad; | ||
277 | } | ||
278 | |||
390 | cc->iv_gen_private.essiv.salt = salt; | 279 | cc->iv_gen_private.essiv.salt = salt; |
280 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
391 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | 281 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; |
392 | 282 | ||
393 | for_each_possible_cpu(cpu) { | ||
394 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, | ||
395 | crypto_hash_digestsize(hash_tfm)); | ||
396 | if (IS_ERR(essiv_tfm)) { | ||
397 | crypt_iv_essiv_dtr(cc); | ||
398 | return PTR_ERR(essiv_tfm); | ||
399 | } | ||
400 | per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; | ||
401 | } | ||
402 | |||
403 | return 0; | 283 | return 0; |
404 | 284 | ||
405 | bad: | 285 | bad: |
286 | if (essiv_tfm && !IS_ERR(essiv_tfm)) | ||
287 | crypto_free_cipher(essiv_tfm); | ||
406 | if (hash_tfm && !IS_ERR(hash_tfm)) | 288 | if (hash_tfm && !IS_ERR(hash_tfm)) |
407 | crypto_free_hash(hash_tfm); | 289 | crypto_free_hash(hash_tfm); |
408 | kfree(salt); | 290 | kfree(salt); |
409 | return err; | 291 | return err; |
410 | } | 292 | } |
411 | 293 | ||
412 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, | 294 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
413 | struct dm_crypt_request *dmreq) | ||
414 | { | 295 | { |
415 | struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; | ||
416 | |||
417 | memset(iv, 0, cc->iv_size); | 296 | memset(iv, 0, cc->iv_size); |
418 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); | 297 | *(u64 *)iv = cpu_to_le64(sector); |
419 | crypto_cipher_encrypt_one(essiv_tfm, iv, iv); | 298 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); |
420 | |||
421 | return 0; | 299 | return 0; |
422 | } | 300 | } |
423 | 301 | ||
424 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | 302 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, |
425 | const char *opts) | 303 | const char *opts) |
426 | { | 304 | { |
427 | unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); | 305 | unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); |
428 | int log = ilog2(bs); | 306 | int log = ilog2(bs); |
429 | 307 | ||
430 | /* we need to calculate how far we must shift the sector count | 308 | /* we need to calculate how far we must shift the sector count |
@@ -449,177 +327,25 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) | |||
449 | { | 327 | { |
450 | } | 328 | } |
451 | 329 | ||
452 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, | 330 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
453 | struct dm_crypt_request *dmreq) | ||
454 | { | 331 | { |
455 | __be64 val; | 332 | __be64 val; |
456 | 333 | ||
457 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 334 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
458 | 335 | ||
459 | val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); | 336 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); |
460 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 337 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
461 | 338 | ||
462 | return 0; | 339 | return 0; |
463 | } | 340 | } |
464 | 341 | ||
465 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, | 342 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
466 | struct dm_crypt_request *dmreq) | ||
467 | { | 343 | { |
468 | memset(iv, 0, cc->iv_size); | 344 | memset(iv, 0, cc->iv_size); |
469 | 345 | ||
470 | return 0; | 346 | return 0; |
471 | } | 347 | } |
472 | 348 | ||
473 | static void crypt_iv_lmk_dtr(struct crypt_config *cc) | ||
474 | { | ||
475 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
476 | |||
477 | if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) | ||
478 | crypto_free_shash(lmk->hash_tfm); | ||
479 | lmk->hash_tfm = NULL; | ||
480 | |||
481 | kzfree(lmk->seed); | ||
482 | lmk->seed = NULL; | ||
483 | } | ||
484 | |||
485 | static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
486 | const char *opts) | ||
487 | { | ||
488 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
489 | |||
490 | lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); | ||
491 | if (IS_ERR(lmk->hash_tfm)) { | ||
492 | ti->error = "Error initializing LMK hash"; | ||
493 | return PTR_ERR(lmk->hash_tfm); | ||
494 | } | ||
495 | |||
496 | /* No seed in LMK version 2 */ | ||
497 | if (cc->key_parts == cc->tfms_count) { | ||
498 | lmk->seed = NULL; | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); | ||
503 | if (!lmk->seed) { | ||
504 | crypt_iv_lmk_dtr(cc); | ||
505 | ti->error = "Error kmallocing seed storage in LMK"; | ||
506 | return -ENOMEM; | ||
507 | } | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int crypt_iv_lmk_init(struct crypt_config *cc) | ||
513 | { | ||
514 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
515 | int subkey_size = cc->key_size / cc->key_parts; | ||
516 | |||
517 | /* LMK seed is on the position of LMK_KEYS + 1 key */ | ||
518 | if (lmk->seed) | ||
519 | memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), | ||
520 | crypto_shash_digestsize(lmk->hash_tfm)); | ||
521 | |||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static int crypt_iv_lmk_wipe(struct crypt_config *cc) | ||
526 | { | ||
527 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
528 | |||
529 | if (lmk->seed) | ||
530 | memset(lmk->seed, 0, LMK_SEED_SIZE); | ||
531 | |||
532 | return 0; | ||
533 | } | ||
534 | |||
535 | static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, | ||
536 | struct dm_crypt_request *dmreq, | ||
537 | u8 *data) | ||
538 | { | ||
539 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
540 | struct { | ||
541 | struct shash_desc desc; | ||
542 | char ctx[crypto_shash_descsize(lmk->hash_tfm)]; | ||
543 | } sdesc; | ||
544 | struct md5_state md5state; | ||
545 | u32 buf[4]; | ||
546 | int i, r; | ||
547 | |||
548 | sdesc.desc.tfm = lmk->hash_tfm; | ||
549 | sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
550 | |||
551 | r = crypto_shash_init(&sdesc.desc); | ||
552 | if (r) | ||
553 | return r; | ||
554 | |||
555 | if (lmk->seed) { | ||
556 | r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); | ||
557 | if (r) | ||
558 | return r; | ||
559 | } | ||
560 | |||
561 | /* Sector is always 512B, block size 16, add data of blocks 1-31 */ | ||
562 | r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); | ||
563 | if (r) | ||
564 | return r; | ||
565 | |||
566 | /* Sector is cropped to 56 bits here */ | ||
567 | buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); | ||
568 | buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); | ||
569 | buf[2] = cpu_to_le32(4024); | ||
570 | buf[3] = 0; | ||
571 | r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); | ||
572 | if (r) | ||
573 | return r; | ||
574 | |||
575 | /* No MD5 padding here */ | ||
576 | r = crypto_shash_export(&sdesc.desc, &md5state); | ||
577 | if (r) | ||
578 | return r; | ||
579 | |||
580 | for (i = 0; i < MD5_HASH_WORDS; i++) | ||
581 | __cpu_to_le32s(&md5state.hash[i]); | ||
582 | memcpy(iv, &md5state.hash, cc->iv_size); | ||
583 | |||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | ||
588 | struct dm_crypt_request *dmreq) | ||
589 | { | ||
590 | u8 *src; | ||
591 | int r = 0; | ||
592 | |||
593 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { | ||
594 | src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); | ||
595 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); | ||
596 | kunmap_atomic(src, KM_USER0); | ||
597 | } else | ||
598 | memset(iv, 0, cc->iv_size); | ||
599 | |||
600 | return r; | ||
601 | } | ||
602 | |||
603 | static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | ||
604 | struct dm_crypt_request *dmreq) | ||
605 | { | ||
606 | u8 *dst; | ||
607 | int r; | ||
608 | |||
609 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) | ||
610 | return 0; | ||
611 | |||
612 | dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); | ||
613 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); | ||
614 | |||
615 | /* Tweak the first block of plaintext sector */ | ||
616 | if (!r) | ||
617 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); | ||
618 | |||
619 | kunmap_atomic(dst, KM_USER0); | ||
620 | return r; | ||
621 | } | ||
622 | |||
623 | static struct crypt_iv_operations crypt_iv_plain_ops = { | 349 | static struct crypt_iv_operations crypt_iv_plain_ops = { |
624 | .generator = crypt_iv_plain_gen | 350 | .generator = crypt_iv_plain_gen |
625 | }; | 351 | }; |
@@ -646,15 +372,6 @@ static struct crypt_iv_operations crypt_iv_null_ops = { | |||
646 | .generator = crypt_iv_null_gen | 372 | .generator = crypt_iv_null_gen |
647 | }; | 373 | }; |
648 | 374 | ||
649 | static struct crypt_iv_operations crypt_iv_lmk_ops = { | ||
650 | .ctr = crypt_iv_lmk_ctr, | ||
651 | .dtr = crypt_iv_lmk_dtr, | ||
652 | .init = crypt_iv_lmk_init, | ||
653 | .wipe = crypt_iv_lmk_wipe, | ||
654 | .generator = crypt_iv_lmk_gen, | ||
655 | .post = crypt_iv_lmk_post | ||
656 | }; | ||
657 | |||
658 | static void crypt_convert_init(struct crypt_config *cc, | 375 | static void crypt_convert_init(struct crypt_config *cc, |
659 | struct convert_context *ctx, | 376 | struct convert_context *ctx, |
660 | struct bio *bio_out, struct bio *bio_in, | 377 | struct bio *bio_out, struct bio *bio_in, |
@@ -682,13 +399,6 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, | |||
682 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); | 399 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); |
683 | } | 400 | } |
684 | 401 | ||
685 | static u8 *iv_of_dmreq(struct crypt_config *cc, | ||
686 | struct dm_crypt_request *dmreq) | ||
687 | { | ||
688 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), | ||
689 | crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); | ||
690 | } | ||
691 | |||
692 | static int crypt_convert_block(struct crypt_config *cc, | 402 | static int crypt_convert_block(struct crypt_config *cc, |
693 | struct convert_context *ctx, | 403 | struct convert_context *ctx, |
694 | struct ablkcipher_request *req) | 404 | struct ablkcipher_request *req) |
@@ -700,9 +410,9 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
700 | int r = 0; | 410 | int r = 0; |
701 | 411 | ||
702 | dmreq = dmreq_of_req(cc, req); | 412 | dmreq = dmreq_of_req(cc, req); |
703 | iv = iv_of_dmreq(cc, dmreq); | 413 | iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), |
414 | crypto_ablkcipher_alignmask(cc->tfm) + 1); | ||
704 | 415 | ||
705 | dmreq->iv_sector = ctx->sector; | ||
706 | dmreq->ctx = ctx; | 416 | dmreq->ctx = ctx; |
707 | sg_init_table(&dmreq->sg_in, 1); | 417 | sg_init_table(&dmreq->sg_in, 1); |
708 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | 418 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, |
@@ -725,7 +435,7 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
725 | } | 435 | } |
726 | 436 | ||
727 | if (cc->iv_gen_ops) { | 437 | if (cc->iv_gen_ops) { |
728 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); | 438 | r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); |
729 | if (r < 0) | 439 | if (r < 0) |
730 | return r; | 440 | return r; |
731 | } | 441 | } |
@@ -738,28 +448,21 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
738 | else | 448 | else |
739 | r = crypto_ablkcipher_decrypt(req); | 449 | r = crypto_ablkcipher_decrypt(req); |
740 | 450 | ||
741 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
742 | r = cc->iv_gen_ops->post(cc, iv, dmreq); | ||
743 | |||
744 | return r; | 451 | return r; |
745 | } | 452 | } |
746 | 453 | ||
747 | static void kcryptd_async_done(struct crypto_async_request *async_req, | 454 | static void kcryptd_async_done(struct crypto_async_request *async_req, |
748 | int error); | 455 | int error); |
749 | |||
750 | static void crypt_alloc_req(struct crypt_config *cc, | 456 | static void crypt_alloc_req(struct crypt_config *cc, |
751 | struct convert_context *ctx) | 457 | struct convert_context *ctx) |
752 | { | 458 | { |
753 | struct crypt_cpu *this_cc = this_crypt_config(cc); | 459 | if (!cc->req) |
754 | unsigned key_index = ctx->sector & (cc->tfms_count - 1); | 460 | cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); |
755 | 461 | ablkcipher_request_set_tfm(cc->req, cc->tfm); | |
756 | if (!this_cc->req) | 462 | ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | |
757 | this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); | 463 | CRYPTO_TFM_REQ_MAY_SLEEP, |
758 | 464 | kcryptd_async_done, | |
759 | ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); | 465 | dmreq_of_req(cc, cc->req)); |
760 | ablkcipher_request_set_callback(this_cc->req, | ||
761 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
762 | kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); | ||
763 | } | 466 | } |
764 | 467 | ||
765 | /* | 468 | /* |
@@ -768,7 +471,6 @@ static void crypt_alloc_req(struct crypt_config *cc, | |||
768 | static int crypt_convert(struct crypt_config *cc, | 471 | static int crypt_convert(struct crypt_config *cc, |
769 | struct convert_context *ctx) | 472 | struct convert_context *ctx) |
770 | { | 473 | { |
771 | struct crypt_cpu *this_cc = this_crypt_config(cc); | ||
772 | int r; | 474 | int r; |
773 | 475 | ||
774 | atomic_set(&ctx->pending, 1); | 476 | atomic_set(&ctx->pending, 1); |
@@ -780,7 +482,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
780 | 482 | ||
781 | atomic_inc(&ctx->pending); | 483 | atomic_inc(&ctx->pending); |
782 | 484 | ||
783 | r = crypt_convert_block(cc, ctx, this_cc->req); | 485 | r = crypt_convert_block(cc, ctx, cc->req); |
784 | 486 | ||
785 | switch (r) { | 487 | switch (r) { |
786 | /* async */ | 488 | /* async */ |
@@ -789,7 +491,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
789 | INIT_COMPLETION(ctx->restart); | 491 | INIT_COMPLETION(ctx->restart); |
790 | /* fall through*/ | 492 | /* fall through*/ |
791 | case -EINPROGRESS: | 493 | case -EINPROGRESS: |
792 | this_cc->req = NULL; | 494 | cc->req = NULL; |
793 | ctx->sector++; | 495 | ctx->sector++; |
794 | continue; | 496 | continue; |
795 | 497 | ||
@@ -948,9 +650,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io) | |||
948 | * They must be separated as otherwise the final stages could be | 650 | * They must be separated as otherwise the final stages could be |
949 | * starved by new requests which can block in the first stages due | 651 | * starved by new requests which can block in the first stages due |
950 | * to memory allocation. | 652 | * to memory allocation. |
951 | * | ||
952 | * The work is done per CPU global for all dm-crypt instances. | ||
953 | * They should not depend on each other and do not block. | ||
954 | */ | 653 | */ |
955 | static void crypt_endio(struct bio *clone, int error) | 654 | static void crypt_endio(struct bio *clone, int error) |
956 | { | 655 | { |
@@ -991,22 +690,25 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) | |||
991 | clone->bi_destructor = dm_crypt_bio_destructor; | 690 | clone->bi_destructor = dm_crypt_bio_destructor; |
992 | } | 691 | } |
993 | 692 | ||
994 | static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) | 693 | static void kcryptd_io_read(struct dm_crypt_io *io) |
995 | { | 694 | { |
996 | struct crypt_config *cc = io->target->private; | 695 | struct crypt_config *cc = io->target->private; |
997 | struct bio *base_bio = io->base_bio; | 696 | struct bio *base_bio = io->base_bio; |
998 | struct bio *clone; | 697 | struct bio *clone; |
999 | 698 | ||
699 | crypt_inc_pending(io); | ||
700 | |||
1000 | /* | 701 | /* |
1001 | * The block layer might modify the bvec array, so always | 702 | * The block layer might modify the bvec array, so always |
1002 | * copy the required bvecs because we need the original | 703 | * copy the required bvecs because we need the original |
1003 | * one in order to decrypt the whole bio data *afterwards*. | 704 | * one in order to decrypt the whole bio data *afterwards*. |
1004 | */ | 705 | */ |
1005 | clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); | 706 | clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); |
1006 | if (!clone) | 707 | if (unlikely(!clone)) { |
1007 | return 1; | 708 | io->error = -ENOMEM; |
1008 | 709 | crypt_dec_pending(io); | |
1009 | crypt_inc_pending(io); | 710 | return; |
711 | } | ||
1010 | 712 | ||
1011 | clone_init(io, clone); | 713 | clone_init(io, clone); |
1012 | clone->bi_idx = 0; | 714 | clone->bi_idx = 0; |
@@ -1017,7 +719,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) | |||
1017 | sizeof(struct bio_vec) * clone->bi_vcnt); | 719 | sizeof(struct bio_vec) * clone->bi_vcnt); |
1018 | 720 | ||
1019 | generic_make_request(clone); | 721 | generic_make_request(clone); |
1020 | return 0; | ||
1021 | } | 722 | } |
1022 | 723 | ||
1023 | static void kcryptd_io_write(struct dm_crypt_io *io) | 724 | static void kcryptd_io_write(struct dm_crypt_io *io) |
@@ -1030,12 +731,9 @@ static void kcryptd_io(struct work_struct *work) | |||
1030 | { | 731 | { |
1031 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | 732 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); |
1032 | 733 | ||
1033 | if (bio_data_dir(io->base_bio) == READ) { | 734 | if (bio_data_dir(io->base_bio) == READ) |
1034 | crypt_inc_pending(io); | 735 | kcryptd_io_read(io); |
1035 | if (kcryptd_io_read(io, GFP_NOIO)) | 736 | else |
1036 | io->error = -ENOMEM; | ||
1037 | crypt_dec_pending(io); | ||
1038 | } else | ||
1039 | kcryptd_io_write(io); | 737 | kcryptd_io_write(io); |
1040 | } | 738 | } |
1041 | 739 | ||
@@ -1202,9 +900,6 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
1202 | return; | 900 | return; |
1203 | } | 901 | } |
1204 | 902 | ||
1205 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
1206 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | ||
1207 | |||
1208 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 903 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
1209 | 904 | ||
1210 | if (!atomic_dec_and_test(&ctx->pending)) | 905 | if (!atomic_dec_and_test(&ctx->pending)) |
@@ -1275,93 +970,34 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | |||
1275 | } | 970 | } |
1276 | } | 971 | } |
1277 | 972 | ||
1278 | static void crypt_free_tfms(struct crypt_config *cc, int cpu) | ||
1279 | { | ||
1280 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1281 | unsigned i; | ||
1282 | |||
1283 | for (i = 0; i < cc->tfms_count; i++) | ||
1284 | if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { | ||
1285 | crypto_free_ablkcipher(cpu_cc->tfms[i]); | ||
1286 | cpu_cc->tfms[i] = NULL; | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) | ||
1291 | { | ||
1292 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1293 | unsigned i; | ||
1294 | int err; | ||
1295 | |||
1296 | for (i = 0; i < cc->tfms_count; i++) { | ||
1297 | cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); | ||
1298 | if (IS_ERR(cpu_cc->tfms[i])) { | ||
1299 | err = PTR_ERR(cpu_cc->tfms[i]); | ||
1300 | crypt_free_tfms(cc, cpu); | ||
1301 | return err; | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | return 0; | ||
1306 | } | ||
1307 | |||
1308 | static int crypt_setkey_allcpus(struct crypt_config *cc) | ||
1309 | { | ||
1310 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); | ||
1311 | int cpu, err = 0, i, r; | ||
1312 | |||
1313 | for_each_possible_cpu(cpu) { | ||
1314 | for (i = 0; i < cc->tfms_count; i++) { | ||
1315 | r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], | ||
1316 | cc->key + (i * subkey_size), subkey_size); | ||
1317 | if (r) | ||
1318 | err = r; | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | return err; | ||
1323 | } | ||
1324 | |||
1325 | static int crypt_set_key(struct crypt_config *cc, char *key) | 973 | static int crypt_set_key(struct crypt_config *cc, char *key) |
1326 | { | 974 | { |
1327 | int r = -EINVAL; | ||
1328 | int key_string_len = strlen(key); | ||
1329 | |||
1330 | /* The key size may not be changed. */ | 975 | /* The key size may not be changed. */ |
1331 | if (cc->key_size != (key_string_len >> 1)) | 976 | if (cc->key_size != (strlen(key) >> 1)) |
1332 | goto out; | 977 | return -EINVAL; |
1333 | 978 | ||
1334 | /* Hyphen (which gives a key_size of zero) means there is no key. */ | 979 | /* Hyphen (which gives a key_size of zero) means there is no key. */ |
1335 | if (!cc->key_size && strcmp(key, "-")) | 980 | if (!cc->key_size && strcmp(key, "-")) |
1336 | goto out; | 981 | return -EINVAL; |
1337 | 982 | ||
1338 | if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) | 983 | if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) |
1339 | goto out; | 984 | return -EINVAL; |
1340 | 985 | ||
1341 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 986 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
1342 | 987 | ||
1343 | r = crypt_setkey_allcpus(cc); | 988 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
1344 | |||
1345 | out: | ||
1346 | /* Hex key string not needed after here, so wipe it. */ | ||
1347 | memset(key, '0', key_string_len); | ||
1348 | |||
1349 | return r; | ||
1350 | } | 989 | } |
1351 | 990 | ||
1352 | static int crypt_wipe_key(struct crypt_config *cc) | 991 | static int crypt_wipe_key(struct crypt_config *cc) |
1353 | { | 992 | { |
1354 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 993 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
1355 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 994 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
1356 | 995 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); | |
1357 | return crypt_setkey_allcpus(cc); | ||
1358 | } | 996 | } |
1359 | 997 | ||
1360 | static void crypt_dtr(struct dm_target *ti) | 998 | static void crypt_dtr(struct dm_target *ti) |
1361 | { | 999 | { |
1362 | struct crypt_config *cc = ti->private; | 1000 | struct crypt_config *cc = ti->private; |
1363 | struct crypt_cpu *cpu_cc; | ||
1364 | int cpu; | ||
1365 | 1001 | ||
1366 | ti->private = NULL; | 1002 | ti->private = NULL; |
1367 | 1003 | ||
@@ -1373,14 +1009,6 @@ static void crypt_dtr(struct dm_target *ti) | |||
1373 | if (cc->crypt_queue) | 1009 | if (cc->crypt_queue) |
1374 | destroy_workqueue(cc->crypt_queue); | 1010 | destroy_workqueue(cc->crypt_queue); |
1375 | 1011 | ||
1376 | if (cc->cpu) | ||
1377 | for_each_possible_cpu(cpu) { | ||
1378 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1379 | if (cpu_cc->req) | ||
1380 | mempool_free(cpu_cc->req, cc->req_pool); | ||
1381 | crypt_free_tfms(cc, cpu); | ||
1382 | } | ||
1383 | |||
1384 | if (cc->bs) | 1012 | if (cc->bs) |
1385 | bioset_free(cc->bs); | 1013 | bioset_free(cc->bs); |
1386 | 1014 | ||
@@ -1394,12 +1022,12 @@ static void crypt_dtr(struct dm_target *ti) | |||
1394 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 1022 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
1395 | cc->iv_gen_ops->dtr(cc); | 1023 | cc->iv_gen_ops->dtr(cc); |
1396 | 1024 | ||
1025 | if (cc->tfm && !IS_ERR(cc->tfm)) | ||
1026 | crypto_free_ablkcipher(cc->tfm); | ||
1027 | |||
1397 | if (cc->dev) | 1028 | if (cc->dev) |
1398 | dm_put_device(ti, cc->dev); | 1029 | dm_put_device(ti, cc->dev); |
1399 | 1030 | ||
1400 | if (cc->cpu) | ||
1401 | free_percpu(cc->cpu); | ||
1402 | |||
1403 | kzfree(cc->cipher); | 1031 | kzfree(cc->cipher); |
1404 | kzfree(cc->cipher_string); | 1032 | kzfree(cc->cipher_string); |
1405 | 1033 | ||
@@ -1411,9 +1039,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1411 | char *cipher_in, char *key) | 1039 | char *cipher_in, char *key) |
1412 | { | 1040 | { |
1413 | struct crypt_config *cc = ti->private; | 1041 | struct crypt_config *cc = ti->private; |
1414 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; | 1042 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts; |
1415 | char *cipher_api = NULL; | 1043 | char *cipher_api = NULL; |
1416 | int cpu, ret = -EINVAL; | 1044 | int ret = -EINVAL; |
1417 | 1045 | ||
1418 | /* Convert to crypto api definition? */ | 1046 | /* Convert to crypto api definition? */ |
1419 | if (strchr(cipher_in, '(')) { | 1047 | if (strchr(cipher_in, '(')) { |
@@ -1427,20 +1055,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1427 | 1055 | ||
1428 | /* | 1056 | /* |
1429 | * Legacy dm-crypt cipher specification | 1057 | * Legacy dm-crypt cipher specification |
1430 | * cipher[:keycount]-mode-iv:ivopts | 1058 | * cipher-mode-iv:ivopts |
1431 | */ | 1059 | */ |
1432 | tmp = cipher_in; | 1060 | tmp = cipher_in; |
1433 | keycount = strsep(&tmp, "-"); | 1061 | cipher = strsep(&tmp, "-"); |
1434 | cipher = strsep(&keycount, ":"); | ||
1435 | |||
1436 | if (!keycount) | ||
1437 | cc->tfms_count = 1; | ||
1438 | else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || | ||
1439 | !is_power_of_2(cc->tfms_count)) { | ||
1440 | ti->error = "Bad cipher key count specification"; | ||
1441 | return -EINVAL; | ||
1442 | } | ||
1443 | cc->key_parts = cc->tfms_count; | ||
1444 | 1062 | ||
1445 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | 1063 | cc->cipher = kstrdup(cipher, GFP_KERNEL); |
1446 | if (!cc->cipher) | 1064 | if (!cc->cipher) |
@@ -1453,14 +1071,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1453 | if (tmp) | 1071 | if (tmp) |
1454 | DMWARN("Ignoring unexpected additional cipher options"); | 1072 | DMWARN("Ignoring unexpected additional cipher options"); |
1455 | 1073 | ||
1456 | cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + | ||
1457 | cc->tfms_count * sizeof(*(cc->cpu->tfms)), | ||
1458 | __alignof__(struct crypt_cpu)); | ||
1459 | if (!cc->cpu) { | ||
1460 | ti->error = "Cannot allocate per cpu state"; | ||
1461 | goto bad_mem; | ||
1462 | } | ||
1463 | |||
1464 | /* | 1074 | /* |
1465 | * For compatibility with the original dm-crypt mapping format, if | 1075 | * For compatibility with the original dm-crypt mapping format, if |
1466 | * only the cipher name is supplied, use cbc-plain. | 1076 | * only the cipher name is supplied, use cbc-plain. |
@@ -1487,12 +1097,11 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1487 | } | 1097 | } |
1488 | 1098 | ||
1489 | /* Allocate cipher */ | 1099 | /* Allocate cipher */ |
1490 | for_each_possible_cpu(cpu) { | 1100 | cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); |
1491 | ret = crypt_alloc_tfms(cc, cpu, cipher_api); | 1101 | if (IS_ERR(cc->tfm)) { |
1492 | if (ret < 0) { | 1102 | ret = PTR_ERR(cc->tfm); |
1493 | ti->error = "Error allocating crypto tfm"; | 1103 | ti->error = "Error allocating crypto tfm"; |
1494 | goto bad; | 1104 | goto bad; |
1495 | } | ||
1496 | } | 1105 | } |
1497 | 1106 | ||
1498 | /* Initialize and set key */ | 1107 | /* Initialize and set key */ |
@@ -1503,7 +1112,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1503 | } | 1112 | } |
1504 | 1113 | ||
1505 | /* Initialize IV */ | 1114 | /* Initialize IV */ |
1506 | cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); | 1115 | cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); |
1507 | if (cc->iv_size) | 1116 | if (cc->iv_size) |
1508 | /* at least a 64 bit sector number should fit in our buffer */ | 1117 | /* at least a 64 bit sector number should fit in our buffer */ |
1509 | cc->iv_size = max(cc->iv_size, | 1118 | cc->iv_size = max(cc->iv_size, |
@@ -1526,15 +1135,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1526 | cc->iv_gen_ops = &crypt_iv_benbi_ops; | 1135 | cc->iv_gen_ops = &crypt_iv_benbi_ops; |
1527 | else if (strcmp(ivmode, "null") == 0) | 1136 | else if (strcmp(ivmode, "null") == 0) |
1528 | cc->iv_gen_ops = &crypt_iv_null_ops; | 1137 | cc->iv_gen_ops = &crypt_iv_null_ops; |
1529 | else if (strcmp(ivmode, "lmk") == 0) { | 1138 | else { |
1530 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | ||
1531 | /* Version 2 and 3 is recognised according | ||
1532 | * to length of provided multi-key string. | ||
1533 | * If present (version 3), last key is used as IV seed. | ||
1534 | */ | ||
1535 | if (cc->key_size % cc->key_parts) | ||
1536 | cc->key_parts++; | ||
1537 | } else { | ||
1538 | ret = -EINVAL; | 1139 | ret = -EINVAL; |
1539 | ti->error = "Invalid IV mode"; | 1140 | ti->error = "Invalid IV mode"; |
1540 | goto bad; | 1141 | goto bad; |
@@ -1575,11 +1176,17 @@ bad_mem: | |||
1575 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | 1176 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
1576 | { | 1177 | { |
1577 | struct crypt_config *cc; | 1178 | struct crypt_config *cc; |
1578 | unsigned int key_size; | 1179 | unsigned int key_size, opt_params; |
1579 | unsigned long long tmpll; | 1180 | unsigned long long tmpll; |
1580 | int ret; | 1181 | int ret; |
1182 | struct dm_arg_set as; | ||
1183 | const char *opt_string; | ||
1184 | |||
1185 | static struct dm_arg _args[] = { | ||
1186 | {0, 1, "Invalid number of feature args"}, | ||
1187 | }; | ||
1581 | 1188 | ||
1582 | if (argc != 5) { | 1189 | if (argc < 5) { |
1583 | ti->error = "Not enough arguments"; | 1190 | ti->error = "Not enough arguments"; |
1584 | return -EINVAL; | 1191 | return -EINVAL; |
1585 | } | 1192 | } |
@@ -1606,9 +1213,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1606 | } | 1213 | } |
1607 | 1214 | ||
1608 | cc->dmreq_start = sizeof(struct ablkcipher_request); | 1215 | cc->dmreq_start = sizeof(struct ablkcipher_request); |
1609 | cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); | 1216 | cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); |
1610 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); | 1217 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); |
1611 | cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & | 1218 | cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & |
1612 | ~(crypto_tfm_ctx_alignment() - 1); | 1219 | ~(crypto_tfm_ctx_alignment() - 1); |
1613 | 1220 | ||
1614 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + | 1221 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + |
@@ -1617,6 +1224,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1617 | ti->error = "Cannot allocate crypt request mempool"; | 1224 | ti->error = "Cannot allocate crypt request mempool"; |
1618 | goto bad; | 1225 | goto bad; |
1619 | } | 1226 | } |
1227 | cc->req = NULL; | ||
1620 | 1228 | ||
1621 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); | 1229 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); |
1622 | if (!cc->page_pool) { | 1230 | if (!cc->page_pool) { |
@@ -1648,27 +1256,46 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1648 | } | 1256 | } |
1649 | cc->start = tmpll; | 1257 | cc->start = tmpll; |
1650 | 1258 | ||
1259 | argv += 5; | ||
1260 | argc -= 5; | ||
1261 | |||
1262 | /* Optional parameters */ | ||
1263 | if (argc) { | ||
1264 | as.argc = argc; | ||
1265 | as.argv = argv; | ||
1266 | |||
1267 | ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); | ||
1268 | if (ret) | ||
1269 | goto bad; | ||
1270 | |||
1271 | opt_string = dm_shift_arg(&as); | ||
1272 | |||
1273 | if (opt_params == 1 && opt_string && | ||
1274 | !strcasecmp(opt_string, "allow_discards")) | ||
1275 | ti->num_discard_requests = 1; | ||
1276 | else if (opt_params) { | ||
1277 | ret = -EINVAL; | ||
1278 | ti->error = "Invalid feature arguments"; | ||
1279 | goto bad; | ||
1280 | } | ||
1281 | } | ||
1282 | |||
1651 | ret = -ENOMEM; | 1283 | ret = -ENOMEM; |
1652 | cc->io_queue = alloc_workqueue("kcryptd_io", | 1284 | cc->io_queue = create_singlethread_workqueue("kcryptd_io"); |
1653 | WQ_NON_REENTRANT| | ||
1654 | WQ_MEM_RECLAIM, | ||
1655 | 1); | ||
1656 | if (!cc->io_queue) { | 1285 | if (!cc->io_queue) { |
1657 | ti->error = "Couldn't create kcryptd io queue"; | 1286 | ti->error = "Couldn't create kcryptd io queue"; |
1658 | goto bad; | 1287 | goto bad; |
1659 | } | 1288 | } |
1660 | 1289 | ||
1661 | cc->crypt_queue = alloc_workqueue("kcryptd", | 1290 | cc->crypt_queue = create_singlethread_workqueue("kcryptd"); |
1662 | WQ_NON_REENTRANT| | ||
1663 | WQ_CPU_INTENSIVE| | ||
1664 | WQ_MEM_RECLAIM, | ||
1665 | 1); | ||
1666 | if (!cc->crypt_queue) { | 1291 | if (!cc->crypt_queue) { |
1667 | ti->error = "Couldn't create kcryptd queue"; | 1292 | ti->error = "Couldn't create kcryptd queue"; |
1668 | goto bad; | 1293 | goto bad; |
1669 | } | 1294 | } |
1670 | 1295 | ||
1671 | ti->num_flush_requests = 1; | 1296 | ti->num_flush_requests = 1; |
1297 | ti->discard_zeroes_data_unsupported = 1; | ||
1298 | |||
1672 | return 0; | 1299 | return 0; |
1673 | 1300 | ||
1674 | bad: | 1301 | bad: |
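
The optional feature-argument handling in this hunk is built on the generic dm_arg_set helpers (dm_read_arg_group(), dm_shift_arg(), dm_read_arg(), dm_consume_args()) that also appear in the dm-flakey and dm-mpath changes further down. A minimal sketch of the pattern, assuming the declarations from the device-mapper header; the target and the "example_feature" keyword are placeholders:

    #include <linux/string.h>
    #include <linux/device-mapper.h>

    static int example_parse_features(struct dm_arg_set *as, struct dm_target *ti)
    {
        /* {min, max, error} bounds checked by dm_read_arg_group() */
        static struct dm_arg _args[] = {
            {0, 1, "Invalid number of feature args"},
        };
        unsigned argc;
        const char *arg_name;
        int r;

        r = dm_read_arg_group(_args, as, &argc, &ti->error);
        if (r)
            return r;

        while (argc--) {
            arg_name = dm_shift_arg(as);

            if (!strcasecmp(arg_name, "example_feature")) /* placeholder keyword */
                continue;

            ti->error = "Unrecognised feature requested";
            return -EINVAL;
        }

        return 0;
    }
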
@@ -1682,18 +1309,24 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1682 | struct dm_crypt_io *io; | 1309 | struct dm_crypt_io *io; |
1683 | struct crypt_config *cc; | 1310 | struct crypt_config *cc; |
1684 | 1311 | ||
1685 | if (bio->bi_rw & REQ_FLUSH) { | 1312 | /* |
1313 | * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. | ||
1314 | * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight | ||
1315 | * - for REQ_DISCARD caller must use flush if IO ordering matters | ||
1316 | */ | ||
1317 | if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { | ||
1686 | cc = ti->private; | 1318 | cc = ti->private; |
1687 | bio->bi_bdev = cc->dev->bdev; | 1319 | bio->bi_bdev = cc->dev->bdev; |
1320 | if (bio_sectors(bio)) | ||
1321 | bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); | ||
1688 | return DM_MAPIO_REMAPPED; | 1322 | return DM_MAPIO_REMAPPED; |
1689 | } | 1323 | } |
1690 | 1324 | ||
1691 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); | 1325 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); |
1692 | 1326 | ||
1693 | if (bio_data_dir(io->base_bio) == READ) { | 1327 | if (bio_data_dir(io->base_bio) == READ) |
1694 | if (kcryptd_io_read(io, GFP_NOWAIT)) | 1328 | kcryptd_queue_io(io); |
1695 | kcryptd_queue_io(io); | 1329 | else |
1696 | } else | ||
1697 | kcryptd_queue_crypt(io); | 1330 | kcryptd_queue_crypt(io); |
1698 | 1331 | ||
1699 | return DM_MAPIO_SUBMITTED; | 1332 | return DM_MAPIO_SUBMITTED; |
@@ -1727,6 +1360,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type, | |||
1727 | 1360 | ||
1728 | DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, | 1361 | DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, |
1729 | cc->dev->name, (unsigned long long)cc->start); | 1362 | cc->dev->name, (unsigned long long)cc->start); |
1363 | |||
1364 | if (ti->num_discard_requests) | ||
1365 | DMEMIT(" 1 allow_discards"); | ||
1366 | |||
1730 | break; | 1367 | break; |
1731 | } | 1368 | } |
1732 | return 0; | 1369 | return 0; |
@@ -1770,12 +1407,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1770 | if (argc < 2) | 1407 | if (argc < 2) |
1771 | goto error; | 1408 | goto error; |
1772 | 1409 | ||
1773 | if (!strnicmp(argv[0], MESG_STR("key"))) { | 1410 | if (!strcasecmp(argv[0], "key")) { |
1774 | if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { | 1411 | if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { |
1775 | DMWARN("not suspended during key manipulation."); | 1412 | DMWARN("not suspended during key manipulation."); |
1776 | return -EINVAL; | 1413 | return -EINVAL; |
1777 | } | 1414 | } |
1778 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { | 1415 | if (argc == 3 && !strcasecmp(argv[1], "set")) { |
1779 | ret = crypt_set_key(cc, argv[2]); | 1416 | ret = crypt_set_key(cc, argv[2]); |
1780 | if (ret) | 1417 | if (ret) |
1781 | return ret; | 1418 | return ret; |
@@ -1783,7 +1420,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1783 | ret = cc->iv_gen_ops->init(cc); | 1420 | ret = cc->iv_gen_ops->init(cc); |
1784 | return ret; | 1421 | return ret; |
1785 | } | 1422 | } |
1786 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { | 1423 | if (argc == 2 && !strcasecmp(argv[1], "wipe")) { |
1787 | if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { | 1424 | if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { |
1788 | ret = cc->iv_gen_ops->wipe(cc); | 1425 | ret = cc->iv_gen_ops->wipe(cc); |
1789 | if (ret) | 1426 | if (ret) |
@@ -1823,7 +1460,7 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
1823 | 1460 | ||
1824 | static struct target_type crypt_target = { | 1461 | static struct target_type crypt_target = { |
1825 | .name = "crypt", | 1462 | .name = "crypt", |
1826 | .version = {1, 10, 0}, | 1463 | .version = {1, 8, 0}, |
1827 | .module = THIS_MODULE, | 1464 | .module = THIS_MODULE, |
1828 | .ctr = crypt_ctr, | 1465 | .ctr = crypt_ctr, |
1829 | .dtr = crypt_dtr, | 1466 | .dtr = crypt_dtr, |
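
For readers following the crypt_target (and, below, flakey_target) structures, this is a bare-bones sketch of how a bio-based target of this era plugs into the device-mapper core. Everything named example_* is a placeholder, the offset argument parsing is elided, and the .map return values follow the same DM_MAPIO_REMAPPED / DM_MAPIO_SUBMITTED convention used by crypt_map() above:

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/bio.h>
    #include <linux/device-mapper.h>

    struct example_c {
        struct dm_dev *dev;
        sector_t start;
    };

    /* Table line: <start> <len> example <dev_path> <offset> */
    static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
    {
        struct example_c *ec;

        if (argc != 2) {
            ti->error = "Invalid argument count";
            return -EINVAL;
        }

        ec = kzalloc(sizeof(*ec), GFP_KERNEL);
        if (!ec) {
            ti->error = "Cannot allocate context";
            return -ENOMEM;
        }

        /* argv[1] (the offset) is left at 0 to keep the sketch short */
        if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev)) {
            ti->error = "Device lookup failed";
            kfree(ec);
            return -EINVAL;
        }

        ti->private = ec;
        return 0;
    }

    static void example_dtr(struct dm_target *ti)
    {
        struct example_c *ec = ti->private;

        dm_put_device(ti, ec->dev);
        kfree(ec);
    }

    static int example_map(struct dm_target *ti, struct bio *bio,
                           union map_info *map_context)
    {
        struct example_c *ec = ti->private;

        /* Remap and let the core submit the bio ... */
        bio->bi_bdev = ec->dev->bdev;
        bio->bi_sector = ec->start + dm_target_offset(ti, bio->bi_sector);
        return DM_MAPIO_REMAPPED;
        /* ... or take ownership, submit later, and return DM_MAPIO_SUBMITTED. */
    }

    static struct target_type example_target = {
        .name    = "example",
        .version = {1, 0, 0},
        .module  = THIS_MODULE,
        .ctr     = example_ctr,
        .dtr     = example_dtr,
        .map     = example_map,
    };

    static int __init dm_example_init(void) { return dm_register_target(&example_target); }
    static void __exit dm_example_exit(void) { dm_unregister_target(&example_target); }
    module_init(dm_example_init);
    module_exit(dm_example_exit);
    MODULE_LICENSE("GPL");
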
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index ea790623c30..f84c08029b2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Sistina Software (UK) Limited. | 2 | * Copyright (C) 2003 Sistina Software (UK) Limited. |
3 | * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. |
4 | * | 4 | * |
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
@@ -15,6 +15,9 @@ | |||
15 | 15 | ||
16 | #define DM_MSG_PREFIX "flakey" | 16 | #define DM_MSG_PREFIX "flakey" |
17 | 17 | ||
18 | #define all_corrupt_bio_flags_match(bio, fc) \ | ||
19 | (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) | ||
20 | |||
18 | /* | 21 | /* |
19 | * Flakey: Used for testing only, simulates intermittent, | 22 | * Flakey: Used for testing only, simulates intermittent, |
20 | * catastrophic device failure. | 23 | * catastrophic device failure. |
@@ -25,60 +28,191 @@ struct flakey_c { | |||
25 | sector_t start; | 28 | sector_t start; |
26 | unsigned up_interval; | 29 | unsigned up_interval; |
27 | unsigned down_interval; | 30 | unsigned down_interval; |
31 | unsigned long flags; | ||
32 | unsigned corrupt_bio_byte; | ||
33 | unsigned corrupt_bio_rw; | ||
34 | unsigned corrupt_bio_value; | ||
35 | unsigned corrupt_bio_flags; | ||
36 | }; | ||
37 | |||
38 | enum feature_flag_bits { | ||
39 | DROP_WRITES | ||
28 | }; | 40 | }; |
29 | 41 | ||
42 | static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, | ||
43 | struct dm_target *ti) | ||
44 | { | ||
45 | int r; | ||
46 | unsigned argc; | ||
47 | const char *arg_name; | ||
48 | |||
49 | static struct dm_arg _args[] = { | ||
50 | {0, 6, "Invalid number of feature args"}, | ||
51 | {1, UINT_MAX, "Invalid corrupt bio byte"}, | ||
52 | {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, | ||
53 | {0, UINT_MAX, "Invalid corrupt bio flags mask"}, | ||
54 | }; | ||
55 | |||
56 | /* No feature arguments supplied. */ | ||
57 | if (!as->argc) | ||
58 | return 0; | ||
59 | |||
60 | r = dm_read_arg_group(_args, as, &argc, &ti->error); | ||
61 | if (r) | ||
62 | return r; | ||
63 | |||
64 | while (argc) { | ||
65 | arg_name = dm_shift_arg(as); | ||
66 | argc--; | ||
67 | |||
68 | /* | ||
69 | * drop_writes | ||
70 | */ | ||
71 | if (!strcasecmp(arg_name, "drop_writes")) { | ||
72 | if (test_and_set_bit(DROP_WRITES, &fc->flags)) { | ||
73 | ti->error = "Feature drop_writes duplicated"; | ||
74 | return -EINVAL; | ||
75 | } | ||
76 | |||
77 | continue; | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> | ||
82 | */ | ||
83 | if (!strcasecmp(arg_name, "corrupt_bio_byte")) { | ||
84 | if (!argc) { | ||
85 | ti->error = "Feature corrupt_bio_byte requires parameters"; | ||
86 | return -EINVAL; | ||
87 | } | ||
88 | |||
89 | r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); | ||
90 | if (r) | ||
91 | return r; | ||
92 | argc--; | ||
93 | |||
94 | /* | ||
95 | * Direction r or w? | ||
96 | */ | ||
97 | arg_name = dm_shift_arg(as); | ||
98 | if (!strcasecmp(arg_name, "w")) | ||
99 | fc->corrupt_bio_rw = WRITE; | ||
100 | else if (!strcasecmp(arg_name, "r")) | ||
101 | fc->corrupt_bio_rw = READ; | ||
102 | else { | ||
103 | ti->error = "Invalid corrupt bio direction (r or w)"; | ||
104 | return -EINVAL; | ||
105 | } | ||
106 | argc--; | ||
107 | |||
108 | /* | ||
109 | * Value of byte (0-255) to write in place of correct one. | ||
110 | */ | ||
111 | r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); | ||
112 | if (r) | ||
113 | return r; | ||
114 | argc--; | ||
115 | |||
116 | /* | ||
117 | * Only corrupt bios with these flags set. | ||
118 | */ | ||
119 | r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); | ||
120 | if (r) | ||
121 | return r; | ||
122 | argc--; | ||
123 | |||
124 | continue; | ||
125 | } | ||
126 | |||
127 | ti->error = "Unrecognised flakey feature requested"; | ||
128 | return -EINVAL; | ||
129 | } | ||
130 | |||
131 | if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { | ||
132 | ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; | ||
133 | return -EINVAL; | ||
134 | } | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
30 | /* | 139 | /* |
31 | * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> | 140 | * Construct a flakey mapping: |
141 | * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] | ||
142 | * | ||
143 | * Feature args: | ||
144 | * [drop_writes] | ||
145 | * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] | ||
146 | * | ||
147 | * Nth_byte starts from 1 for the first byte. | ||
148 | * Direction is r for READ or w for WRITE. | ||
149 | * bio_flags is ignored if 0. | ||
32 | */ | 150 | */ |
33 | static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | 151 | static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
34 | { | 152 | { |
153 | static struct dm_arg _args[] = { | ||
154 | {0, UINT_MAX, "Invalid up interval"}, | ||
155 | {0, UINT_MAX, "Invalid down interval"}, | ||
156 | }; | ||
157 | |||
158 | int r; | ||
35 | struct flakey_c *fc; | 159 | struct flakey_c *fc; |
36 | unsigned long long tmp; | 160 | unsigned long long tmpll; |
161 | struct dm_arg_set as; | ||
162 | const char *devname; | ||
37 | 163 | ||
38 | if (argc != 4) { | 164 | as.argc = argc; |
39 | ti->error = "dm-flakey: Invalid argument count"; | 165 | as.argv = argv; |
166 | |||
167 | if (argc < 4) { | ||
168 | ti->error = "Invalid argument count"; | ||
40 | return -EINVAL; | 169 | return -EINVAL; |
41 | } | 170 | } |
42 | 171 | ||
43 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); | 172 | fc = kzalloc(sizeof(*fc), GFP_KERNEL); |
44 | if (!fc) { | 173 | if (!fc) { |
45 | ti->error = "dm-flakey: Cannot allocate linear context"; | 174 | ti->error = "Cannot allocate linear context"; |
46 | return -ENOMEM; | 175 | return -ENOMEM; |
47 | } | 176 | } |
48 | fc->start_time = jiffies; | 177 | fc->start_time = jiffies; |
49 | 178 | ||
50 | if (sscanf(argv[1], "%llu", &tmp) != 1) { | 179 | devname = dm_shift_arg(&as); |
51 | ti->error = "dm-flakey: Invalid device sector"; | 180 | |
181 | if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { | ||
182 | ti->error = "Invalid device sector"; | ||
52 | goto bad; | 183 | goto bad; |
53 | } | 184 | } |
54 | fc->start = tmp; | 185 | fc->start = tmpll; |
55 | 186 | ||
56 | if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { | 187 | r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); |
57 | ti->error = "dm-flakey: Invalid up interval"; | 188 | if (r) |
58 | goto bad; | 189 | goto bad; |
59 | } | ||
60 | 190 | ||
61 | if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { | 191 | r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); |
62 | ti->error = "dm-flakey: Invalid down interval"; | 192 | if (r) |
63 | goto bad; | 193 | goto bad; |
64 | } | ||
65 | 194 | ||
66 | if (!(fc->up_interval + fc->down_interval)) { | 195 | if (!(fc->up_interval + fc->down_interval)) { |
67 | ti->error = "dm-flakey: Total (up + down) interval is zero"; | 196 | ti->error = "Total (up + down) interval is zero"; |
68 | goto bad; | 197 | goto bad; |
69 | } | 198 | } |
70 | 199 | ||
71 | if (fc->up_interval + fc->down_interval < fc->up_interval) { | 200 | if (fc->up_interval + fc->down_interval < fc->up_interval) { |
72 | ti->error = "dm-flakey: Interval overflow"; | 201 | ti->error = "Interval overflow"; |
73 | goto bad; | 202 | goto bad; |
74 | } | 203 | } |
75 | 204 | ||
76 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { | 205 | r = parse_features(&as, fc, ti); |
77 | ti->error = "dm-flakey: Device lookup failed"; | 206 | if (r) |
207 | goto bad; | ||
208 | |||
209 | if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { | ||
210 | ti->error = "Device lookup failed"; | ||
78 | goto bad; | 211 | goto bad; |
79 | } | 212 | } |
80 | 213 | ||
81 | ti->num_flush_requests = 1; | 214 | ti->num_flush_requests = 1; |
215 | ti->num_discard_requests = 1; | ||
82 | ti->private = fc; | 216 | ti->private = fc; |
83 | return 0; | 217 | return 0; |
84 | 218 | ||
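
The constructor above takes the four original positional arguments followed by an optional feature block, e.g. a table line such as "0 409600 flakey /dev/sdb 0 30 15 6 drop_writes corrupt_bio_byte 32 r 1 0" (device, sizes and values are placeholders; the 6 is the number of feature words that follow). The up/down intervals feed the cycle arithmetic used later in flakey_map(); a minimal sketch of that calculation, assuming the non-zero interval sum the constructor enforces:

    #include <linux/types.h>
    #include <linux/jiffies.h>

    /* Same arithmetic as flakey_map(); start_time is recorded at ctr time. */
    static bool device_currently_down(unsigned long start_time,
                                      unsigned up_interval,
                                      unsigned down_interval)
    {
        unsigned elapsed = (jiffies - start_time) / HZ;

        return elapsed % (up_interval + down_interval) >= up_interval;
    }
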
@@ -99,7 +233,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) | |||
99 | { | 233 | { |
100 | struct flakey_c *fc = ti->private; | 234 | struct flakey_c *fc = ti->private; |
101 | 235 | ||
102 | return fc->start + (bi_sector - ti->begin); | 236 | return fc->start + dm_target_offset(ti, bi_sector); |
103 | } | 237 | } |
104 | 238 | ||
105 | static void flakey_map_bio(struct dm_target *ti, struct bio *bio) | 239 | static void flakey_map_bio(struct dm_target *ti, struct bio *bio) |
@@ -111,6 +245,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) | |||
111 | bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); | 245 | bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); |
112 | } | 246 | } |
113 | 247 | ||
248 | static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) | ||
249 | { | ||
250 | unsigned bio_bytes = bio_cur_bytes(bio); | ||
251 | char *data = bio_data(bio); | ||
252 | |||
253 | /* | ||
254 | * Overwrite the Nth byte of the data returned. | ||
255 | */ | ||
256 | if (data && bio_bytes >= fc->corrupt_bio_byte) { | ||
257 | data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; | ||
258 | |||
259 | DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " | ||
260 | "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", | ||
261 | bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, | ||
262 | (bio_data_dir(bio) == WRITE) ? 'w' : 'r', | ||
263 | bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); | ||
264 | } | ||
265 | } | ||
266 | |||
114 | static int flakey_map(struct dm_target *ti, struct bio *bio, | 267 | static int flakey_map(struct dm_target *ti, struct bio *bio, |
115 | union map_info *map_context) | 268 | union map_info *map_context) |
116 | { | 269 | { |
@@ -119,18 +272,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio, | |||
119 | 272 | ||
120 | /* Are we alive ? */ | 273 | /* Are we alive ? */ |
121 | elapsed = (jiffies - fc->start_time) / HZ; | 274 | elapsed = (jiffies - fc->start_time) / HZ; |
122 | if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) | 275 | if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { |
276 | /* | ||
277 | * Flag this bio as submitted while down. | ||
278 | */ | ||
279 | map_context->ll = 1; | ||
280 | |||
281 | /* | ||
282 | * Map reads as normal. | ||
283 | */ | ||
284 | if (bio_data_dir(bio) == READ) | ||
285 | goto map_bio; | ||
286 | |||
287 | /* | ||
288 | * Drop writes? | ||
289 | */ | ||
290 | if (test_bit(DROP_WRITES, &fc->flags)) { | ||
291 | bio_endio(bio, 0); | ||
292 | return DM_MAPIO_SUBMITTED; | ||
293 | } | ||
294 | |||
295 | /* | ||
296 | * Corrupt matching writes. | ||
297 | */ | ||
298 | if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { | ||
299 | if (all_corrupt_bio_flags_match(bio, fc)) | ||
300 | corrupt_bio_data(bio, fc); | ||
301 | goto map_bio; | ||
302 | } | ||
303 | |||
304 | /* | ||
305 | * By default, error all I/O. | ||
306 | */ | ||
123 | return -EIO; | 307 | return -EIO; |
308 | } | ||
124 | 309 | ||
310 | map_bio: | ||
125 | flakey_map_bio(ti, bio); | 311 | flakey_map_bio(ti, bio); |
126 | 312 | ||
127 | return DM_MAPIO_REMAPPED; | 313 | return DM_MAPIO_REMAPPED; |
128 | } | 314 | } |
129 | 315 | ||
316 | static int flakey_end_io(struct dm_target *ti, struct bio *bio, | ||
317 | int error, union map_info *map_context) | ||
318 | { | ||
319 | struct flakey_c *fc = ti->private; | ||
320 | unsigned bio_submitted_while_down = map_context->ll; | ||
321 | |||
322 | /* | ||
323 | * Corrupt successful READs while in down state. | ||
324 | * If flags were specified, only corrupt those that match. | ||
325 | */ | ||
326 | if (!error && bio_submitted_while_down && | ||
327 | (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && | ||
328 | all_corrupt_bio_flags_match(bio, fc)) | ||
329 | corrupt_bio_data(bio, fc); | ||
330 | |||
331 | return error; | ||
332 | } | ||
333 | |||
130 | static int flakey_status(struct dm_target *ti, status_type_t type, | 334 | static int flakey_status(struct dm_target *ti, status_type_t type, |
131 | char *result, unsigned int maxlen) | 335 | char *result, unsigned int maxlen) |
132 | { | 336 | { |
337 | unsigned sz = 0; | ||
133 | struct flakey_c *fc = ti->private; | 338 | struct flakey_c *fc = ti->private; |
339 | unsigned drop_writes; | ||
134 | 340 | ||
135 | switch (type) { | 341 | switch (type) { |
136 | case STATUSTYPE_INFO: | 342 | case STATUSTYPE_INFO: |
@@ -138,9 +344,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type, | |||
138 | break; | 344 | break; |
139 | 345 | ||
140 | case STATUSTYPE_TABLE: | 346 | case STATUSTYPE_TABLE: |
141 | snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, | 347 | DMEMIT("%s %llu %u %u ", fc->dev->name, |
142 | (unsigned long long)fc->start, fc->up_interval, | 348 | (unsigned long long)fc->start, fc->up_interval, |
143 | fc->down_interval); | 349 | fc->down_interval); |
350 | |||
351 | drop_writes = test_bit(DROP_WRITES, &fc->flags); | ||
352 | DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); | ||
353 | |||
354 | if (drop_writes) | ||
355 | DMEMIT("drop_writes "); | ||
356 | |||
357 | if (fc->corrupt_bio_byte) | ||
358 | DMEMIT("corrupt_bio_byte %u %c %u %u ", | ||
359 | fc->corrupt_bio_byte, | ||
360 | (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', | ||
361 | fc->corrupt_bio_value, fc->corrupt_bio_flags); | ||
362 | |||
144 | break; | 363 | break; |
145 | } | 364 | } |
146 | return 0; | 365 | return 0; |
@@ -177,11 +396,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ | |||
177 | 396 | ||
178 | static struct target_type flakey_target = { | 397 | static struct target_type flakey_target = { |
179 | .name = "flakey", | 398 | .name = "flakey", |
180 | .version = {1, 1, 0}, | 399 | .version = {1, 2, 0}, |
181 | .module = THIS_MODULE, | 400 | .module = THIS_MODULE, |
182 | .ctr = flakey_ctr, | 401 | .ctr = flakey_ctr, |
183 | .dtr = flakey_dtr, | 402 | .dtr = flakey_dtr, |
184 | .map = flakey_map, | 403 | .map = flakey_map, |
404 | .end_io = flakey_end_io, | ||
185 | .status = flakey_status, | 405 | .status = flakey_status, |
186 | .ioctl = flakey_ioctl, | 406 | .ioctl = flakey_ioctl, |
187 | .merge = flakey_merge, | 407 | .merge = flakey_merge, |
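
The .end_io hook is what makes READ corruption work: read payloads only exist once the bio completes, so flakey_map() just tags the bio via map_context->ll and flakey_end_io() finishes the job. A generic sketch of that map/end_io hand-off, with inspect_completed_read() as a purely hypothetical helper:

    #include <linux/bio.h>
    #include <linux/device-mapper.h>

    static void inspect_completed_read(struct bio *bio)
    {
        /* hypothetical post-processing, e.g. checksum or corrupt bio_data(bio) */
    }

    static int example_end_io(struct dm_target *ti, struct bio *bio,
                              int error, union map_info *map_context)
    {
        /* per-bio state stored by .map travels with the bio to .end_io */
        if (!error && map_context->ll && bio_data_dir(bio) == READ)
            inspect_completed_read(bio);

        /* the value returned here is what the original submitter sees */
        return error;
    }
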
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 2067288f61f..ad2eba40e31 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -38,6 +38,8 @@ struct io { | |||
38 | struct dm_io_client *client; | 38 | struct dm_io_client *client; |
39 | io_notify_fn callback; | 39 | io_notify_fn callback; |
40 | void *context; | 40 | void *context; |
41 | void *vma_invalidate_address; | ||
42 | unsigned long vma_invalidate_size; | ||
41 | } __attribute__((aligned(DM_IO_MAX_REGIONS))); | 43 | } __attribute__((aligned(DM_IO_MAX_REGIONS))); |
42 | 44 | ||
43 | static struct kmem_cache *_dm_io_cache; | 45 | static struct kmem_cache *_dm_io_cache; |
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error) | |||
116 | set_bit(region, &io->error_bits); | 118 | set_bit(region, &io->error_bits); |
117 | 119 | ||
118 | if (atomic_dec_and_test(&io->count)) { | 120 | if (atomic_dec_and_test(&io->count)) { |
121 | if (io->vma_invalidate_size) | ||
122 | invalidate_kernel_vmap_range(io->vma_invalidate_address, | ||
123 | io->vma_invalidate_size); | ||
124 | |||
119 | if (io->sleeper) | 125 | if (io->sleeper) |
120 | wake_up_process(io->sleeper); | 126 | wake_up_process(io->sleeper); |
121 | 127 | ||
@@ -159,6 +165,9 @@ struct dpages { | |||
159 | 165 | ||
160 | unsigned context_u; | 166 | unsigned context_u; |
161 | void *context_ptr; | 167 | void *context_ptr; |
168 | |||
169 | void *vma_invalidate_address; | ||
170 | unsigned long vma_invalidate_size; | ||
162 | }; | 171 | }; |
163 | 172 | ||
164 | /* | 173 | /* |
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
377 | io->sleeper = current; | 386 | io->sleeper = current; |
378 | io->client = client; | 387 | io->client = client; |
379 | 388 | ||
389 | io->vma_invalidate_address = dp->vma_invalidate_address; | ||
390 | io->vma_invalidate_size = dp->vma_invalidate_size; | ||
391 | |||
380 | dispatch_io(rw, num_regions, where, dp, io, 1); | 392 | dispatch_io(rw, num_regions, where, dp, io, 1); |
381 | 393 | ||
382 | while (1) { | 394 | while (1) { |
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
415 | io->callback = fn; | 427 | io->callback = fn; |
416 | io->context = context; | 428 | io->context = context; |
417 | 429 | ||
430 | io->vma_invalidate_address = dp->vma_invalidate_address; | ||
431 | io->vma_invalidate_size = dp->vma_invalidate_size; | ||
432 | |||
418 | dispatch_io(rw, num_regions, where, dp, io, 0); | 433 | dispatch_io(rw, num_regions, where, dp, io, 0); |
419 | return 0; | 434 | return 0; |
420 | } | 435 | } |
421 | 436 | ||
422 | static int dp_init(struct dm_io_request *io_req, struct dpages *dp) | 437 | static int dp_init(struct dm_io_request *io_req, struct dpages *dp, |
438 | unsigned long size) | ||
423 | { | 439 | { |
424 | /* Set up dpages based on memory type */ | 440 | /* Set up dpages based on memory type */ |
441 | |||
442 | dp->vma_invalidate_address = NULL; | ||
443 | dp->vma_invalidate_size = 0; | ||
444 | |||
425 | switch (io_req->mem.type) { | 445 | switch (io_req->mem.type) { |
426 | case DM_IO_PAGE_LIST: | 446 | case DM_IO_PAGE_LIST: |
427 | list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); | 447 | list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); |
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) | |||
432 | break; | 452 | break; |
433 | 453 | ||
434 | case DM_IO_VMA: | 454 | case DM_IO_VMA: |
455 | flush_kernel_vmap_range(io_req->mem.ptr.vma, size); | ||
456 | if ((io_req->bi_rw & RW_MASK) == READ) { | ||
457 | dp->vma_invalidate_address = io_req->mem.ptr.vma; | ||
458 | dp->vma_invalidate_size = size; | ||
459 | } | ||
435 | vm_dp_init(dp, io_req->mem.ptr.vma); | 460 | vm_dp_init(dp, io_req->mem.ptr.vma); |
436 | break; | 461 | break; |
437 | 462 | ||
@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, | |||
460 | int r; | 485 | int r; |
461 | struct dpages dp; | 486 | struct dpages dp; |
462 | 487 | ||
463 | r = dp_init(io_req, &dp); | 488 | r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); |
464 | if (r) | 489 | if (r) |
465 | return r; | 490 | return r; |
466 | 491 | ||
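
The vma flush/invalidate handling above matters to callers that hand dm-io a vmalloc'ed buffer. A sketch of such a caller, assuming the request/region structures from linux/dm-io.h; leaving notify.fn NULL selects the synchronous path, and read_into_vmalloc() is a hypothetical name:

    #include <linux/fs.h>
    #include <linux/dm-io.h>

    static int read_into_vmalloc(struct dm_io_client *client,
                                 struct block_device *bdev, void *vma_buf,
                                 sector_t sector, sector_t nr_sectors)
    {
        struct dm_io_region region = {
            .bdev   = bdev,
            .sector = sector,
            .count  = nr_sectors,
        };
        struct dm_io_request req = {
            .bi_rw       = READ,
            .mem.type    = DM_IO_VMA,   /* triggers the vmap flush/invalidate above */
            .mem.ptr.vma = vma_buf,
            .notify.fn   = NULL,        /* NULL notify function = synchronous dm_io() */
            .client      = client,
        };
        unsigned long error_bits = 0;

        return dm_io(&req, 1, &region, &error_bits);
    }
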
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4cacdad2270..2e9a3ca37bd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str) | |||
128 | return NULL; | 128 | return NULL; |
129 | } | 129 | } |
130 | 130 | ||
131 | static struct hash_cell *__get_dev_cell(uint64_t dev) | ||
132 | { | ||
133 | struct mapped_device *md; | ||
134 | struct hash_cell *hc; | ||
135 | |||
136 | md = dm_get_md(huge_decode_dev(dev)); | ||
137 | if (!md) | ||
138 | return NULL; | ||
139 | |||
140 | hc = dm_get_mdptr(md); | ||
141 | if (!hc) { | ||
142 | dm_put(md); | ||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | return hc; | ||
147 | } | ||
148 | |||
131 | /*----------------------------------------------------------------- | 149 | /*----------------------------------------------------------------- |
132 | * Inserting, removing and renaming a device. | 150 | * Inserting, removing and renaming a device. |
133 | *---------------------------------------------------------------*/ | 151 | *---------------------------------------------------------------*/ |
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) | |||
718 | */ | 736 | */ |
719 | static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) | 737 | static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) |
720 | { | 738 | { |
721 | struct mapped_device *md; | 739 | struct hash_cell *hc = NULL; |
722 | void *mdptr = NULL; | ||
723 | 740 | ||
724 | if (*param->uuid) | 741 | if (*param->uuid) { |
725 | return __get_uuid_cell(param->uuid); | 742 | if (*param->name || param->dev) |
743 | return NULL; | ||
726 | 744 | ||
727 | if (*param->name) | 745 | hc = __get_uuid_cell(param->uuid); |
728 | return __get_name_cell(param->name); | 746 | if (!hc) |
747 | return NULL; | ||
748 | } else if (*param->name) { | ||
749 | if (param->dev) | ||
750 | return NULL; | ||
729 | 751 | ||
730 | md = dm_get_md(huge_decode_dev(param->dev)); | 752 | hc = __get_name_cell(param->name); |
731 | if (!md) | 753 | if (!hc) |
732 | goto out; | 754 | return NULL; |
755 | } else if (param->dev) { | ||
756 | hc = __get_dev_cell(param->dev); | ||
757 | if (!hc) | ||
758 | return NULL; | ||
759 | } else | ||
760 | return NULL; | ||
733 | 761 | ||
734 | mdptr = dm_get_mdptr(md); | 762 | /* |
735 | if (!mdptr) | 763 | * Sneakily write in both the name and the uuid |
736 | dm_put(md); | 764 | * while we have the cell. |
765 | */ | ||
766 | strlcpy(param->name, hc->name, sizeof(param->name)); | ||
767 | if (hc->uuid) | ||
768 | strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); | ||
769 | else | ||
770 | param->uuid[0] = '\0'; | ||
737 | 771 | ||
738 | out: | 772 | if (hc->new_map) |
739 | return mdptr; | 773 | param->flags |= DM_INACTIVE_PRESENT_FLAG; |
774 | else | ||
775 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
776 | |||
777 | return hc; | ||
740 | } | 778 | } |
741 | 779 | ||
742 | static struct mapped_device *find_device(struct dm_ioctl *param) | 780 | static struct mapped_device *find_device(struct dm_ioctl *param) |
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) | |||
746 | 784 | ||
747 | down_read(&_hash_lock); | 785 | down_read(&_hash_lock); |
748 | hc = __find_device_hash_cell(param); | 786 | hc = __find_device_hash_cell(param); |
749 | if (hc) { | 787 | if (hc) |
750 | md = hc->md; | 788 | md = hc->md; |
751 | |||
752 | /* | ||
753 | * Sneakily write in both the name and the uuid | ||
754 | * while we have the cell. | ||
755 | */ | ||
756 | strlcpy(param->name, hc->name, sizeof(param->name)); | ||
757 | if (hc->uuid) | ||
758 | strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); | ||
759 | else | ||
760 | param->uuid[0] = '\0'; | ||
761 | |||
762 | if (hc->new_map) | ||
763 | param->flags |= DM_INACTIVE_PRESENT_FLAG; | ||
764 | else | ||
765 | param->flags &= ~DM_INACTIVE_PRESENT_FLAG; | ||
766 | } | ||
767 | up_read(&_hash_lock); | 789 | up_read(&_hash_lock); |
768 | 790 | ||
769 | return md; | 791 | return md; |
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1402 | goto out; | 1424 | goto out; |
1403 | } | 1425 | } |
1404 | 1426 | ||
1427 | if (!argc) { | ||
1428 | DMWARN("Empty message received."); | ||
1429 | goto out; | ||
1430 | } | ||
1431 | |||
1405 | table = dm_get_live_table(md); | 1432 | table = dm_get_live_table(md); |
1406 | if (!table) | 1433 | if (!table) |
1407 | goto out_argv; | 1434 | goto out_argv; |
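
In the __find_device_hash_cell() variant shown above, a device must be referenced by exactly one of uuid, name or device number; ambiguous requests return no cell. The name and uuid are copied back into the ioctl parameters while the hash cell is still held, and the target_message() hunk guards against empty messages, warning and bailing out before any table lookup.
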
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 819e37eaaeb..32ac70861d6 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -10,7 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/types.h> | 12 | #include <linux/types.h> |
13 | #include <asm/atomic.h> | 13 | #include <linux/atomic.h> |
14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
@@ -224,8 +224,6 @@ struct kcopyd_job { | |||
224 | unsigned int num_dests; | 224 | unsigned int num_dests; |
225 | struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; | 225 | struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; |
226 | 226 | ||
227 | sector_t offset; | ||
228 | unsigned int nr_pages; | ||
229 | struct page_list *pages; | 227 | struct page_list *pages; |
230 | 228 | ||
231 | /* | 229 | /* |
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job) | |||
380 | .bi_rw = job->rw, | 378 | .bi_rw = job->rw, |
381 | .mem.type = DM_IO_PAGE_LIST, | 379 | .mem.type = DM_IO_PAGE_LIST, |
382 | .mem.ptr.pl = job->pages, | 380 | .mem.ptr.pl = job->pages, |
383 | .mem.offset = job->offset, | 381 | .mem.offset = 0, |
384 | .notify.fn = complete_io, | 382 | .notify.fn = complete_io, |
385 | .notify.context = job, | 383 | .notify.context = job, |
386 | .client = job->kc->io_client, | 384 | .client = job->kc->io_client, |
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job) | |||
397 | static int run_pages_job(struct kcopyd_job *job) | 395 | static int run_pages_job(struct kcopyd_job *job) |
398 | { | 396 | { |
399 | int r; | 397 | int r; |
398 | unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); | ||
400 | 399 | ||
401 | job->nr_pages = dm_div_up(job->dests[0].count + job->offset, | 400 | r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); |
402 | PAGE_SIZE >> 9); | ||
403 | r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); | ||
404 | if (!r) { | 401 | if (!r) { |
405 | /* this job is ready for io */ | 402 | /* this job is ready for io */ |
406 | push(&job->kc->io_jobs, job); | 403 | push(&job->kc->io_jobs, job); |
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
602 | job->num_dests = num_dests; | 599 | job->num_dests = num_dests; |
603 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); | 600 | memcpy(&job->dests, dests, sizeof(*dests) * num_dests); |
604 | 601 | ||
605 | job->offset = 0; | ||
606 | job->nr_pages = 0; | ||
607 | job->pages = NULL; | 602 | job->pages = NULL; |
608 | 603 | ||
609 | job->fn = fn; | 604 | job->fn = fn; |
@@ -622,6 +617,38 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
622 | } | 617 | } |
623 | EXPORT_SYMBOL(dm_kcopyd_copy); | 618 | EXPORT_SYMBOL(dm_kcopyd_copy); |
624 | 619 | ||
620 | void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, | ||
621 | dm_kcopyd_notify_fn fn, void *context) | ||
622 | { | ||
623 | struct kcopyd_job *job; | ||
624 | |||
625 | job = mempool_alloc(kc->job_pool, GFP_NOIO); | ||
626 | |||
627 | memset(job, 0, sizeof(struct kcopyd_job)); | ||
628 | job->kc = kc; | ||
629 | job->fn = fn; | ||
630 | job->context = context; | ||
631 | job->master_job = job; | ||
632 | |||
633 | atomic_inc(&kc->nr_jobs); | ||
634 | |||
635 | return job; | ||
636 | } | ||
637 | EXPORT_SYMBOL(dm_kcopyd_prepare_callback); | ||
638 | |||
639 | void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) | ||
640 | { | ||
641 | struct kcopyd_job *job = j; | ||
642 | struct dm_kcopyd_client *kc = job->kc; | ||
643 | |||
644 | job->read_err = read_err; | ||
645 | job->write_err = write_err; | ||
646 | |||
647 | push(&kc->complete_jobs, job); | ||
648 | wake(kc); | ||
649 | } | ||
650 | EXPORT_SYMBOL(dm_kcopyd_do_callback); | ||
651 | |||
625 | /* | 652 | /* |
626 | * Cancels a kcopyd job, eg. someone might be deactivating a | 653 | * Cancels a kcopyd job, eg. someone might be deactivating a |
627 | * mirror. | 654 | * mirror. |
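
dm_kcopyd_prepare_callback() and dm_kcopyd_do_callback(), shown in the hunk above, let a client allocate a callback-only job, do the actual work by other means, and then push completion through kcopyd so the notify function runs from kcopyd's worker. A sketch of a hypothetical caller:

    #include <linux/dm-kcopyd.h>

    /* runs from the kcopyd workqueue once the callback job is pushed */
    static void my_copy_done(int read_err, unsigned long write_err, void *context)
    {
        /* hypothetical completion handling */
    }

    static void start_private_copy(struct dm_kcopyd_client *kc, void *context)
    {
        void *job = dm_kcopyd_prepare_callback(kc, my_copy_done, context);

        /* ... perform the copy or I/O by some other mechanism ... */

        dm_kcopyd_do_callback(job, 0 /* read_err */, 0 /* write_err */);
    }
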
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index aa2e0c374ab..1021c898601 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list) | |||
394 | group[count] = fe->region; | 394 | group[count] = fe->region; |
395 | count++; | 395 | count++; |
396 | 396 | ||
397 | list_del(&fe->list); | 397 | list_move(&fe->list, &tmp_list); |
398 | list_add(&fe->list, &tmp_list); | ||
399 | 398 | ||
400 | type = fe->type; | 399 | type = fe->type; |
401 | if (count >= MAX_FLUSH_GROUP_COUNT) | 400 | if (count >= MAX_FLUSH_GROUP_COUNT) |
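
The list_move() call shown above is behaviourally the same as the two-step delete/add sequence on the other side of the hunk; a sketch of the equivalence (not the list.h implementation):

    #include <linux/list.h>

    static inline void move_entry(struct list_head *entry, struct list_head *head)
    {
        list_del(entry);        /* unlink from the current list */
        list_add(entry, head);  /* relink at the head of the destination list */
    }
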
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 948e3f4925b..3b52bb72bd1 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy); | |||
197 | #define MIRROR_DISK_VERSION 2 | 197 | #define MIRROR_DISK_VERSION 2 |
198 | #define LOG_OFFSET 2 | 198 | #define LOG_OFFSET 2 |
199 | 199 | ||
200 | struct log_header { | 200 | struct log_header_disk { |
201 | uint32_t magic; | 201 | __le32 magic; |
202 | 202 | ||
203 | /* | 203 | /* |
204 | * Simple, incrementing version. no backward | 204 | * Simple, incrementing version. no backward |
205 | * compatibility. | 205 | * compatibility. |
206 | */ | 206 | */ |
207 | __le32 version; | ||
208 | __le64 nr_regions; | ||
209 | } __packed; | ||
210 | |||
211 | struct log_header_core { | ||
212 | uint32_t magic; | ||
207 | uint32_t version; | 213 | uint32_t version; |
208 | sector_t nr_regions; | 214 | uint64_t nr_regions; |
209 | }; | 215 | }; |
210 | 216 | ||
211 | struct log_c { | 217 | struct log_c { |
@@ -239,10 +245,10 @@ struct log_c { | |||
239 | int log_dev_failed; | 245 | int log_dev_failed; |
240 | int log_dev_flush_failed; | 246 | int log_dev_flush_failed; |
241 | struct dm_dev *log_dev; | 247 | struct dm_dev *log_dev; |
242 | struct log_header header; | 248 | struct log_header_core header; |
243 | 249 | ||
244 | struct dm_io_region header_location; | 250 | struct dm_io_region header_location; |
245 | struct log_header *disk_header; | 251 | struct log_header_disk *disk_header; |
246 | }; | 252 | }; |
247 | 253 | ||
248 | /* | 254 | /* |
@@ -251,34 +257,34 @@ struct log_c { | |||
251 | */ | 257 | */ |
252 | static inline int log_test_bit(uint32_t *bs, unsigned bit) | 258 | static inline int log_test_bit(uint32_t *bs, unsigned bit) |
253 | { | 259 | { |
254 | return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; | 260 | return test_bit_le(bit, bs) ? 1 : 0; |
255 | } | 261 | } |
256 | 262 | ||
257 | static inline void log_set_bit(struct log_c *l, | 263 | static inline void log_set_bit(struct log_c *l, |
258 | uint32_t *bs, unsigned bit) | 264 | uint32_t *bs, unsigned bit) |
259 | { | 265 | { |
260 | __test_and_set_bit_le(bit, (unsigned long *) bs); | 266 | __set_bit_le(bit, bs); |
261 | l->touched_cleaned = 1; | 267 | l->touched_cleaned = 1; |
262 | } | 268 | } |
263 | 269 | ||
264 | static inline void log_clear_bit(struct log_c *l, | 270 | static inline void log_clear_bit(struct log_c *l, |
265 | uint32_t *bs, unsigned bit) | 271 | uint32_t *bs, unsigned bit) |
266 | { | 272 | { |
267 | __test_and_clear_bit_le(bit, (unsigned long *) bs); | 273 | __clear_bit_le(bit, bs); |
268 | l->touched_dirtied = 1; | 274 | l->touched_dirtied = 1; |
269 | } | 275 | } |
270 | 276 | ||
271 | /*---------------------------------------------------------------- | 277 | /*---------------------------------------------------------------- |
272 | * Header IO | 278 | * Header IO |
273 | *--------------------------------------------------------------*/ | 279 | *--------------------------------------------------------------*/ |
274 | static void header_to_disk(struct log_header *core, struct log_header *disk) | 280 | static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) |
275 | { | 281 | { |
276 | disk->magic = cpu_to_le32(core->magic); | 282 | disk->magic = cpu_to_le32(core->magic); |
277 | disk->version = cpu_to_le32(core->version); | 283 | disk->version = cpu_to_le32(core->version); |
278 | disk->nr_regions = cpu_to_le64(core->nr_regions); | 284 | disk->nr_regions = cpu_to_le64(core->nr_regions); |
279 | } | 285 | } |
280 | 286 | ||
281 | static void header_from_disk(struct log_header *core, struct log_header *disk) | 287 | static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) |
282 | { | 288 | { |
283 | core->magic = le32_to_cpu(disk->magic); | 289 | core->magic = le32_to_cpu(disk->magic); |
284 | core->version = le32_to_cpu(disk->version); | 290 | core->version = le32_to_cpu(disk->version); |
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
486 | memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); | 492 | memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); |
487 | lc->sync_count = (sync == NOSYNC) ? region_count : 0; | 493 | lc->sync_count = (sync == NOSYNC) ? region_count : 0; |
488 | 494 | ||
489 | lc->recovering_bits = vmalloc(bitset_size); | 495 | lc->recovering_bits = vzalloc(bitset_size); |
490 | if (!lc->recovering_bits) { | 496 | if (!lc->recovering_bits) { |
491 | DMWARN("couldn't allocate sync bitset"); | 497 | DMWARN("couldn't allocate sync bitset"); |
492 | vfree(lc->sync_bits); | 498 | vfree(lc->sync_bits); |
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
498 | kfree(lc); | 504 | kfree(lc); |
499 | return -ENOMEM; | 505 | return -ENOMEM; |
500 | } | 506 | } |
501 | memset(lc->recovering_bits, 0, bitset_size); | ||
502 | lc->sync_search = 0; | 507 | lc->sync_search = 0; |
503 | log->context = lc; | 508 | log->context = lc; |
504 | 509 | ||
@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | |||
739 | return 0; | 744 | return 0; |
740 | 745 | ||
741 | do { | 746 | do { |
742 | *region = find_next_zero_bit_le( | 747 | *region = find_next_zero_bit_le(lc->sync_bits, |
743 | (unsigned long *) lc->sync_bits, | ||
744 | lc->region_count, | 748 | lc->region_count, |
745 | lc->sync_search); | 749 | lc->sync_search); |
746 | lc->sync_search = *region + 1; | 750 | lc->sync_search = *region + 1; |
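
Splitting log_header into separate disk and core structures keeps the on-disk format fixed-width (the in-core nr_regions was previously a sector_t, whose size depends on kernel configuration) and makes the little-endian, packed layout explicit. A sketch of the same pattern with placeholder example_* names:

    #include <linux/types.h>
    #include <linux/compiler.h>
    #include <asm/byteorder.h>

    struct example_header_disk {
        __le32 magic;
        __le32 version;
        __le64 nr_regions;
    } __packed;                     /* fixed, byte-exact on-disk layout */

    struct example_header_core {    /* native types for in-memory use */
        uint32_t magic;
        uint32_t version;
        uint64_t nr_regions;
    };

    static void example_header_from_disk(struct example_header_core *core,
                                         struct example_header_disk *disk)
    {
        core->magic      = le32_to_cpu(disk->magic);
        core->version    = le32_to_cpu(disk->version);
        core->nr_regions = le64_to_cpu(disk->nr_regions);
        /* the inverse direction uses cpu_to_le32()/cpu_to_le64() */
    }
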
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aa4e570c2cb..5e0090ef418 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -19,10 +19,9 @@ | |||
19 | #include <linux/time.h> | 19 | #include <linux/time.h> |
20 | #include <linux/workqueue.h> | 20 | #include <linux/workqueue.h> |
21 | #include <scsi/scsi_dh.h> | 21 | #include <scsi/scsi_dh.h> |
22 | #include <asm/atomic.h> | 22 | #include <linux/atomic.h> |
23 | 23 | ||
24 | #define DM_MSG_PREFIX "multipath" | 24 | #define DM_MSG_PREFIX "multipath" |
25 | #define MESG_STR(x) x, sizeof(x) | ||
26 | #define DM_PG_INIT_DELAY_MSECS 2000 | 25 | #define DM_PG_INIT_DELAY_MSECS 2000 |
27 | #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) | 26 | #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) |
28 | 27 | ||
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work) | |||
505 | * <#paths> <#per-path selector args> | 504 | * <#paths> <#per-path selector args> |
506 | * [<path> [<arg>]* ]+ ]+ | 505 | * [<path> [<arg>]* ]+ ]+ |
507 | *---------------------------------------------------------------*/ | 506 | *---------------------------------------------------------------*/ |
508 | struct param { | 507 | static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, |
509 | unsigned min; | ||
510 | unsigned max; | ||
511 | char *error; | ||
512 | }; | ||
513 | |||
514 | static int read_param(struct param *param, char *str, unsigned *v, char **error) | ||
515 | { | ||
516 | if (!str || | ||
517 | (sscanf(str, "%u", v) != 1) || | ||
518 | (*v < param->min) || | ||
519 | (*v > param->max)) { | ||
520 | *error = param->error; | ||
521 | return -EINVAL; | ||
522 | } | ||
523 | |||
524 | return 0; | ||
525 | } | ||
526 | |||
527 | struct arg_set { | ||
528 | unsigned argc; | ||
529 | char **argv; | ||
530 | }; | ||
531 | |||
532 | static char *shift(struct arg_set *as) | ||
533 | { | ||
534 | char *r; | ||
535 | |||
536 | if (as->argc) { | ||
537 | as->argc--; | ||
538 | r = *as->argv; | ||
539 | as->argv++; | ||
540 | return r; | ||
541 | } | ||
542 | |||
543 | return NULL; | ||
544 | } | ||
545 | |||
546 | static void consume(struct arg_set *as, unsigned n) | ||
547 | { | ||
548 | BUG_ON (as->argc < n); | ||
549 | as->argc -= n; | ||
550 | as->argv += n; | ||
551 | } | ||
552 | |||
553 | static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | ||
554 | struct dm_target *ti) | 508 | struct dm_target *ti) |
555 | { | 509 | { |
556 | int r; | 510 | int r; |
557 | struct path_selector_type *pst; | 511 | struct path_selector_type *pst; |
558 | unsigned ps_argc; | 512 | unsigned ps_argc; |
559 | 513 | ||
560 | static struct param _params[] = { | 514 | static struct dm_arg _args[] = { |
561 | {0, 1024, "invalid number of path selector args"}, | 515 | {0, 1024, "invalid number of path selector args"}, |
562 | }; | 516 | }; |
563 | 517 | ||
564 | pst = dm_get_path_selector(shift(as)); | 518 | pst = dm_get_path_selector(dm_shift_arg(as)); |
565 | if (!pst) { | 519 | if (!pst) { |
566 | ti->error = "unknown path selector type"; | 520 | ti->error = "unknown path selector type"; |
567 | return -EINVAL; | 521 | return -EINVAL; |
568 | } | 522 | } |
569 | 523 | ||
570 | r = read_param(_params, shift(as), &ps_argc, &ti->error); | 524 | r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); |
571 | if (r) { | 525 | if (r) { |
572 | dm_put_path_selector(pst); | 526 | dm_put_path_selector(pst); |
573 | return -EINVAL; | 527 | return -EINVAL; |
574 | } | 528 | } |
575 | 529 | ||
576 | if (ps_argc > as->argc) { | ||
577 | dm_put_path_selector(pst); | ||
578 | ti->error = "not enough arguments for path selector"; | ||
579 | return -EINVAL; | ||
580 | } | ||
581 | |||
582 | r = pst->create(&pg->ps, ps_argc, as->argv); | 530 | r = pst->create(&pg->ps, ps_argc, as->argv); |
583 | if (r) { | 531 | if (r) { |
584 | dm_put_path_selector(pst); | 532 | dm_put_path_selector(pst); |
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
587 | } | 535 | } |
588 | 536 | ||
589 | pg->ps.type = pst; | 537 | pg->ps.type = pst; |
590 | consume(as, ps_argc); | 538 | dm_consume_args(as, ps_argc); |
591 | 539 | ||
592 | return 0; | 540 | return 0; |
593 | } | 541 | } |
594 | 542 | ||
595 | static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | 543 | static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, |
596 | struct dm_target *ti) | 544 | struct dm_target *ti) |
597 | { | 545 | { |
598 | int r; | 546 | int r; |
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
609 | if (!p) | 557 | if (!p) |
610 | return ERR_PTR(-ENOMEM); | 558 | return ERR_PTR(-ENOMEM); |
611 | 559 | ||
612 | r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), | 560 | r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), |
613 | &p->path.dev); | 561 | &p->path.dev); |
614 | if (r) { | 562 | if (r) { |
615 | ti->error = "error getting device"; | 563 | ti->error = "error getting device"; |
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
660 | return ERR_PTR(r); | 608 | return ERR_PTR(r); |
661 | } | 609 | } |
662 | 610 | ||
663 | static struct priority_group *parse_priority_group(struct arg_set *as, | 611 | static struct priority_group *parse_priority_group(struct dm_arg_set *as, |
664 | struct multipath *m) | 612 | struct multipath *m) |
665 | { | 613 | { |
666 | static struct param _params[] = { | 614 | static struct dm_arg _args[] = { |
667 | {1, 1024, "invalid number of paths"}, | 615 | {1, 1024, "invalid number of paths"}, |
668 | {0, 1024, "invalid number of selector args"} | 616 | {0, 1024, "invalid number of selector args"} |
669 | }; | 617 | }; |
670 | 618 | ||
671 | int r; | 619 | int r; |
672 | unsigned i, nr_selector_args, nr_params; | 620 | unsigned i, nr_selector_args, nr_args; |
673 | struct priority_group *pg; | 621 | struct priority_group *pg; |
674 | struct dm_target *ti = m->ti; | 622 | struct dm_target *ti = m->ti; |
675 | 623 | ||
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as, | |||
693 | /* | 641 | /* |
694 | * read the paths | 642 | * read the paths |
695 | */ | 643 | */ |
696 | r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); | 644 | r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); |
697 | if (r) | 645 | if (r) |
698 | goto bad; | 646 | goto bad; |
699 | 647 | ||
700 | r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); | 648 | r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); |
701 | if (r) | 649 | if (r) |
702 | goto bad; | 650 | goto bad; |
703 | 651 | ||
704 | nr_params = 1 + nr_selector_args; | 652 | nr_args = 1 + nr_selector_args; |
705 | for (i = 0; i < pg->nr_pgpaths; i++) { | 653 | for (i = 0; i < pg->nr_pgpaths; i++) { |
706 | struct pgpath *pgpath; | 654 | struct pgpath *pgpath; |
707 | struct arg_set path_args; | 655 | struct dm_arg_set path_args; |
708 | 656 | ||
709 | if (as->argc < nr_params) { | 657 | if (as->argc < nr_args) { |
710 | ti->error = "not enough path parameters"; | 658 | ti->error = "not enough path parameters"; |
711 | r = -EINVAL; | 659 | r = -EINVAL; |
712 | goto bad; | 660 | goto bad; |
713 | } | 661 | } |
714 | 662 | ||
715 | path_args.argc = nr_params; | 663 | path_args.argc = nr_args; |
716 | path_args.argv = as->argv; | 664 | path_args.argv = as->argv; |
717 | 665 | ||
718 | pgpath = parse_path(&path_args, &pg->ps, ti); | 666 | pgpath = parse_path(&path_args, &pg->ps, ti); |
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as, | |||
723 | 671 | ||
724 | pgpath->pg = pg; | 672 | pgpath->pg = pg; |
725 | list_add_tail(&pgpath->list, &pg->pgpaths); | 673 | list_add_tail(&pgpath->list, &pg->pgpaths); |
726 | consume(as, nr_params); | 674 | dm_consume_args(as, nr_args); |
727 | } | 675 | } |
728 | 676 | ||
729 | return pg; | 677 | return pg; |
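
parse_priority_group() slices the incoming dm_arg_set: each path gets a temporary set covering exactly nr_args words, and dm_consume_args() then advances the parent set past them. A stripped-down sketch of that slicing, with parse_one_item() standing in for parse_path():

    #include <linux/device-mapper.h>

    static int parse_one_item(struct dm_arg_set *sub)
    {
        /* hypothetical per-item parser; sees only sub->argc arguments */
        return 0;
    }

    static int parse_items(struct dm_arg_set *as, unsigned nr_items,
                           unsigned nr_args_per_item)
    {
        unsigned i;
        int r;

        for (i = 0; i < nr_items; i++) {
            struct dm_arg_set sub;

            if (as->argc < nr_args_per_item)
                return -EINVAL;

            sub.argc = nr_args_per_item;
            sub.argv = as->argv;

            r = parse_one_item(&sub);
            if (r)
                return r;

            /* skip the words the sub-parser just looked at */
            dm_consume_args(as, nr_args_per_item);
        }

        return 0;
    }
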
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as, | |||
733 | return ERR_PTR(r); | 681 | return ERR_PTR(r); |
734 | } | 682 | } |
735 | 683 | ||
736 | static int parse_hw_handler(struct arg_set *as, struct multipath *m) | 684 | static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) |
737 | { | 685 | { |
738 | unsigned hw_argc; | 686 | unsigned hw_argc; |
739 | int ret; | 687 | int ret; |
740 | struct dm_target *ti = m->ti; | 688 | struct dm_target *ti = m->ti; |
741 | 689 | ||
742 | static struct param _params[] = { | 690 | static struct dm_arg _args[] = { |
743 | {0, 1024, "invalid number of hardware handler args"}, | 691 | {0, 1024, "invalid number of hardware handler args"}, |
744 | }; | 692 | }; |
745 | 693 | ||
746 | if (read_param(_params, shift(as), &hw_argc, &ti->error)) | 694 | if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) |
747 | return -EINVAL; | 695 | return -EINVAL; |
748 | 696 | ||
749 | if (!hw_argc) | 697 | if (!hw_argc) |
750 | return 0; | 698 | return 0; |
751 | 699 | ||
752 | if (hw_argc > as->argc) { | 700 | m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); |
753 | ti->error = "not enough arguments for hardware handler"; | ||
754 | return -EINVAL; | ||
755 | } | ||
756 | |||
757 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | ||
758 | request_module("scsi_dh_%s", m->hw_handler_name); | 701 | request_module("scsi_dh_%s", m->hw_handler_name); |
759 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 702 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
760 | ti->error = "unknown hardware handler type"; | 703 | ti->error = "unknown hardware handler type"; |
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
778 | for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) | 721 | for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) |
779 | j = sprintf(p, "%s", as->argv[i]); | 722 | j = sprintf(p, "%s", as->argv[i]); |
780 | } | 723 | } |
781 | consume(as, hw_argc - 1); | 724 | dm_consume_args(as, hw_argc - 1); |
782 | 725 | ||
783 | return 0; | 726 | return 0; |
784 | fail: | 727 | fail: |
@@ -787,20 +730,20 @@ fail: | |||
787 | return ret; | 730 | return ret; |
788 | } | 731 | } |
789 | 732 | ||
790 | static int parse_features(struct arg_set *as, struct multipath *m) | 733 | static int parse_features(struct dm_arg_set *as, struct multipath *m) |
791 | { | 734 | { |
792 | int r; | 735 | int r; |
793 | unsigned argc; | 736 | unsigned argc; |
794 | struct dm_target *ti = m->ti; | 737 | struct dm_target *ti = m->ti; |
795 | const char *param_name; | 738 | const char *arg_name; |
796 | 739 | ||
797 | static struct param _params[] = { | 740 | static struct dm_arg _args[] = { |
798 | {0, 5, "invalid number of feature args"}, | 741 | {0, 5, "invalid number of feature args"}, |
799 | {1, 50, "pg_init_retries must be between 1 and 50"}, | 742 | {1, 50, "pg_init_retries must be between 1 and 50"}, |
800 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, | 743 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, |
801 | }; | 744 | }; |
802 | 745 | ||
803 | r = read_param(_params, shift(as), &argc, &ti->error); | 746 | r = dm_read_arg_group(_args, as, &argc, &ti->error); |
804 | if (r) | 747 | if (r) |
805 | return -EINVAL; | 748 | return -EINVAL; |
806 | 749 | ||
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
808 | return 0; | 751 | return 0; |
809 | 752 | ||
810 | do { | 753 | do { |
811 | param_name = shift(as); | 754 | arg_name = dm_shift_arg(as); |
812 | argc--; | 755 | argc--; |
813 | 756 | ||
814 | if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { | 757 | if (!strcasecmp(arg_name, "queue_if_no_path")) { |
815 | r = queue_if_no_path(m, 1, 0); | 758 | r = queue_if_no_path(m, 1, 0); |
816 | continue; | 759 | continue; |
817 | } | 760 | } |
818 | 761 | ||
819 | if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && | 762 | if (!strcasecmp(arg_name, "pg_init_retries") && |
820 | (argc >= 1)) { | 763 | (argc >= 1)) { |
821 | r = read_param(_params + 1, shift(as), | 764 | r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); |
822 | &m->pg_init_retries, &ti->error); | ||
823 | argc--; | 765 | argc--; |
824 | continue; | 766 | continue; |
825 | } | 767 | } |
826 | 768 | ||
827 | if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && | 769 | if (!strcasecmp(arg_name, "pg_init_delay_msecs") && |
828 | (argc >= 1)) { | 770 | (argc >= 1)) { |
829 | r = read_param(_params + 2, shift(as), | 771 | r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); |
830 | &m->pg_init_delay_msecs, &ti->error); | ||
831 | argc--; | 772 | argc--; |
832 | continue; | 773 | continue; |
833 | } | 774 | } |
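
The hunks above replace dm-mpath's private read_param()/shift()/consume() helpers with the generic dm_arg_set API. Below is a minimal user-space sketch of that parsing pattern; the helpers are reimplemented locally for illustration only (the real versions live in the device-mapper core, and dm_read_arg_group() additionally checks that the group count fits in the remaining argument list).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

struct dm_arg_set { unsigned argc; char **argv; };
struct dm_arg { unsigned min; unsigned max; char *error; };

/* Simplified stand-in for the kernel's dm_shift_arg(). */
static const char *dm_shift_arg(struct dm_arg_set *as)
{
	if (!as->argc)
		return NULL;
	as->argc--;
	return *as->argv++;
}

/* Simplified stand-in for dm_read_arg()/dm_read_arg_group(): shift one
 * word, convert it and range-check it against the dm_arg bounds.
 */
static int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *as,
		       unsigned *value, char **error)
{
	const char *s = dm_shift_arg(as);
	char *end;
	unsigned long v = s ? strtoul(s, &end, 10) : 0;

	if (!s || *end || v < arg->min || v > arg->max) {
		*error = arg->error;
		return -1;
	}
	*value = v;
	return 0;
}

int main(void)
{
	char *words[] = { "3", "queue_if_no_path", "pg_init_retries", "5" };
	struct dm_arg_set as = { 4, words };
	struct dm_arg args[] = {
		{ 0, 5, "invalid number of feature args" },
		{ 1, 50, "pg_init_retries must be between 1 and 50" },
	};
	unsigned argc, retries = 0;
	char *error = NULL;

	if (dm_read_arg(&args[0], &as, &argc, &error)) {
		fprintf(stderr, "%s\n", error);
		return 1;
	}

	while (argc) {
		const char *arg_name = dm_shift_arg(&as);

		argc--;
		if (!strcasecmp(arg_name, "queue_if_no_path"))
			continue;
		if (!strcasecmp(arg_name, "pg_init_retries") && argc >= 1) {
			if (dm_read_arg(&args[1], &as, &retries, &error)) {
				fprintf(stderr, "%s\n", error);
				return 1;
			}
			argc--;
		}
	}
	printf("pg_init_retries = %u\n", retries);
	return 0;
}

The same three-step shape (read the group count, shift keywords, read keyed values) is what multipath_ctr() and parse_hw_handler() follow after this conversion.
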
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
842 | static int multipath_ctr(struct dm_target *ti, unsigned int argc, | 783 | static int multipath_ctr(struct dm_target *ti, unsigned int argc, |
843 | char **argv) | 784 | char **argv) |
844 | { | 785 | { |
845 | /* target parameters */ | 786 | /* target arguments */ |
846 | static struct param _params[] = { | 787 | static struct dm_arg _args[] = { |
847 | {0, 1024, "invalid number of priority groups"}, | 788 | {0, 1024, "invalid number of priority groups"}, |
848 | {0, 1024, "invalid initial priority group number"}, | 789 | {0, 1024, "invalid initial priority group number"}, |
849 | }; | 790 | }; |
850 | 791 | ||
851 | int r; | 792 | int r; |
852 | struct multipath *m; | 793 | struct multipath *m; |
853 | struct arg_set as; | 794 | struct dm_arg_set as; |
854 | unsigned pg_count = 0; | 795 | unsigned pg_count = 0; |
855 | unsigned next_pg_num; | 796 | unsigned next_pg_num; |
856 | 797 | ||
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
871 | if (r) | 812 | if (r) |
872 | goto bad; | 813 | goto bad; |
873 | 814 | ||
874 | r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); | 815 | r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); |
875 | if (r) | 816 | if (r) |
876 | goto bad; | 817 | goto bad; |
877 | 818 | ||
878 | r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); | 819 | r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); |
879 | if (r) | 820 | if (r) |
880 | goto bad; | 821 | goto bad; |
881 | 822 | ||
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1505 | } | 1446 | } |
1506 | 1447 | ||
1507 | if (argc == 1) { | 1448 | if (argc == 1) { |
1508 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { | 1449 | if (!strcasecmp(argv[0], "queue_if_no_path")) { |
1509 | r = queue_if_no_path(m, 1, 0); | 1450 | r = queue_if_no_path(m, 1, 0); |
1510 | goto out; | 1451 | goto out; |
1511 | } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { | 1452 | } else if (!strcasecmp(argv[0], "fail_if_no_path")) { |
1512 | r = queue_if_no_path(m, 0, 0); | 1453 | r = queue_if_no_path(m, 0, 0); |
1513 | goto out; | 1454 | goto out; |
1514 | } | 1455 | } |
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1519 | goto out; | 1460 | goto out; |
1520 | } | 1461 | } |
1521 | 1462 | ||
1522 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) { | 1463 | if (!strcasecmp(argv[0], "disable_group")) { |
1523 | r = bypass_pg_num(m, argv[1], 1); | 1464 | r = bypass_pg_num(m, argv[1], 1); |
1524 | goto out; | 1465 | goto out; |
1525 | } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { | 1466 | } else if (!strcasecmp(argv[0], "enable_group")) { |
1526 | r = bypass_pg_num(m, argv[1], 0); | 1467 | r = bypass_pg_num(m, argv[1], 0); |
1527 | goto out; | 1468 | goto out; |
1528 | } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { | 1469 | } else if (!strcasecmp(argv[0], "switch_group")) { |
1529 | r = switch_pg_num(m, argv[1]); | 1470 | r = switch_pg_num(m, argv[1]); |
1530 | goto out; | 1471 | goto out; |
1531 | } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | 1472 | } else if (!strcasecmp(argv[0], "reinstate_path")) |
1532 | action = reinstate_path; | 1473 | action = reinstate_path; |
1533 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | 1474 | else if (!strcasecmp(argv[0], "fail_path")) |
1534 | action = fail_path; | 1475 | action = fail_path; |
1535 | else { | 1476 | else { |
1536 | DMWARN("Unrecognised multipath message received."); | 1477 | DMWARN("Unrecognised multipath message received."); |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index f92b6cea9d9..03a837aa5ce 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/ctype.h> | 20 | #include <linux/ctype.h> |
21 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <asm/atomic.h> | 23 | #include <linux/atomic.h> |
24 | 24 | ||
25 | #define DM_MSG_PREFIX "multipath queue-length" | 25 | #define DM_MSG_PREFIX "multipath queue-length" |
26 | #define QL_MIN_IO 128 | 26 | #define QL_MIN_IO 128 |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e5d8904fc8f..86df8b2cf92 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -8,19 +8,19 @@ | |||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | 9 | ||
10 | #include "md.h" | 10 | #include "md.h" |
11 | #include "raid1.h" | ||
11 | #include "raid5.h" | 12 | #include "raid5.h" |
12 | #include "dm.h" | ||
13 | #include "bitmap.h" | 13 | #include "bitmap.h" |
14 | 14 | ||
15 | #include <linux/device-mapper.h> | ||
16 | |||
15 | #define DM_MSG_PREFIX "raid" | 17 | #define DM_MSG_PREFIX "raid" |
16 | 18 | ||
17 | /* | 19 | /* |
18 | * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then | 20 | * The following flags are used by dm-raid.c to set up the array state. |
19 | * make it so the flag doesn't set anything. | 21 | * They must be cleared before md_run is called. |
20 | */ | 22 | */ |
21 | #ifndef MD_SYNC_STATE_FORCED | 23 | #define FirstUse 10 /* rdev flag */ |
22 | #define MD_SYNC_STATE_FORCED 0 | ||
23 | #endif | ||
24 | 24 | ||
25 | struct raid_dev { | 25 | struct raid_dev { |
26 | /* | 26 | /* |
@@ -43,14 +43,15 @@ struct raid_dev { | |||
43 | /* | 43 | /* |
44 | * Flags for rs->print_flags field. | 44 | * Flags for rs->print_flags field. |
45 | */ | 45 | */ |
46 | #define DMPF_DAEMON_SLEEP 0x1 | 46 | #define DMPF_SYNC 0x1 |
47 | #define DMPF_MAX_WRITE_BEHIND 0x2 | 47 | #define DMPF_NOSYNC 0x2 |
48 | #define DMPF_SYNC 0x4 | 48 | #define DMPF_REBUILD 0x4 |
49 | #define DMPF_NOSYNC 0x8 | 49 | #define DMPF_DAEMON_SLEEP 0x8 |
50 | #define DMPF_STRIPE_CACHE 0x10 | 50 | #define DMPF_MIN_RECOVERY_RATE 0x10 |
51 | #define DMPF_MIN_RECOVERY_RATE 0x20 | 51 | #define DMPF_MAX_RECOVERY_RATE 0x20 |
52 | #define DMPF_MAX_RECOVERY_RATE 0x40 | 52 | #define DMPF_MAX_WRITE_BEHIND 0x40 |
53 | 53 | #define DMPF_STRIPE_CACHE 0x80 | |
54 | #define DMPF_REGION_SIZE 0X100 | ||
54 | struct raid_set { | 55 | struct raid_set { |
55 | struct dm_target *ti; | 56 | struct dm_target *ti; |
56 | 57 | ||
@@ -72,6 +73,7 @@ static struct raid_type { | |||
72 | const unsigned level; /* RAID level. */ | 73 | const unsigned level; /* RAID level. */ |
73 | const unsigned algorithm; /* RAID algorithm. */ | 74 | const unsigned algorithm; /* RAID algorithm. */ |
74 | } raid_types[] = { | 75 | } raid_types[] = { |
76 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, | ||
75 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | 77 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, |
76 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | 78 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, |
77 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | 79 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, |
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra | |||
105 | } | 107 | } |
106 | 108 | ||
107 | sectors_per_dev = ti->len; | 109 | sectors_per_dev = ti->len; |
108 | if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { | 110 | if ((raid_type->level > 1) && |
111 | sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { | ||
109 | ti->error = "Target length not divisible by number of data devices"; | 112 | ti->error = "Target length not divisible by number of data devices"; |
110 | return ERR_PTR(-EINVAL); | 113 | return ERR_PTR(-EINVAL); |
111 | } | 114 | } |
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs) | |||
147 | { | 150 | { |
148 | int i; | 151 | int i; |
149 | 152 | ||
150 | for (i = 0; i < rs->md.raid_disks; i++) | 153 | for (i = 0; i < rs->md.raid_disks; i++) { |
154 | if (rs->dev[i].meta_dev) | ||
155 | dm_put_device(rs->ti, rs->dev[i].meta_dev); | ||
156 | if (rs->dev[i].rdev.sb_page) | ||
157 | put_page(rs->dev[i].rdev.sb_page); | ||
158 | rs->dev[i].rdev.sb_page = NULL; | ||
159 | rs->dev[i].rdev.sb_loaded = 0; | ||
151 | if (rs->dev[i].data_dev) | 160 | if (rs->dev[i].data_dev) |
152 | dm_put_device(rs->ti, rs->dev[i].data_dev); | 161 | dm_put_device(rs->ti, rs->dev[i].data_dev); |
162 | } | ||
153 | 163 | ||
154 | kfree(rs); | 164 | kfree(rs); |
155 | } | 165 | } |
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs) | |||
159 | * <meta_dev>: meta device name or '-' if missing | 169 | * <meta_dev>: meta device name or '-' if missing |
160 | * <data_dev>: data device name or '-' if missing | 170 | * <data_dev>: data device name or '-' if missing |
161 | * | 171 | * |
162 | * This code parses those words. | 172 | * The following are permitted: |
173 | * - - | ||
174 | * - <data_dev> | ||
175 | * <meta_dev> <data_dev> | ||
176 | * | ||
177 | * The following is not allowed: | ||
178 | * <meta_dev> - | ||
179 | * | ||
180 | * This code parses those words. If there is a failure, | ||
181 | * the caller must use context_free to unwind the operations. | ||
163 | */ | 182 | */ |
164 | static int dev_parms(struct raid_set *rs, char **argv) | 183 | static int dev_parms(struct raid_set *rs, char **argv) |
165 | { | 184 | { |
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv) | |||
182 | rs->dev[i].rdev.mddev = &rs->md; | 201 | rs->dev[i].rdev.mddev = &rs->md; |
183 | 202 | ||
184 | if (strcmp(argv[0], "-")) { | 203 | if (strcmp(argv[0], "-")) { |
185 | rs->ti->error = "Metadata devices not supported"; | 204 | ret = dm_get_device(rs->ti, argv[0], |
186 | return -EINVAL; | 205 | dm_table_get_mode(rs->ti->table), |
206 | &rs->dev[i].meta_dev); | ||
207 | rs->ti->error = "RAID metadata device lookup failure"; | ||
208 | if (ret) | ||
209 | return ret; | ||
210 | |||
211 | rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); | ||
212 | if (!rs->dev[i].rdev.sb_page) | ||
213 | return -ENOMEM; | ||
187 | } | 214 | } |
188 | 215 | ||
189 | if (!strcmp(argv[1], "-")) { | 216 | if (!strcmp(argv[1], "-")) { |
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv) | |||
193 | return -EINVAL; | 220 | return -EINVAL; |
194 | } | 221 | } |
195 | 222 | ||
223 | rs->ti->error = "No data device supplied with metadata device"; | ||
224 | if (rs->dev[i].meta_dev) | ||
225 | return -EINVAL; | ||
226 | |||
196 | continue; | 227 | continue; |
197 | } | 228 | } |
198 | 229 | ||
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv) | |||
204 | return ret; | 235 | return ret; |
205 | } | 236 | } |
206 | 237 | ||
238 | if (rs->dev[i].meta_dev) { | ||
239 | metadata_available = 1; | ||
240 | rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; | ||
241 | } | ||
207 | rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; | 242 | rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; |
208 | list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); | 243 | list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); |
209 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) | 244 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) |
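
The comment block above spells out which <meta_dev> <data_dev> word pairs dev_parms() accepts. A user-space sketch of just that pair rule (the device paths are made-up examples):

#include <stdio.h>
#include <string.h>

/* Returns 0 if the <meta_dev> <data_dev> pair is acceptable, -1 otherwise.
 * Mirrors the rule documented for dev_parms(): a metadata device may only
 * be supplied together with a data device.
 */
static int check_dev_pair(const char *meta, const char *data, const char **err)
{
	int have_meta = strcmp(meta, "-");
	int have_data = strcmp(data, "-");

	if (have_meta && !have_data) {
		*err = "No data device supplied with metadata device";
		return -1;
	}
	return 0;
}

int main(void)
{
	const char *pairs[][2] = {
		{ "-", "-" }, { "-", "/dev/sdb1" },
		{ "/dev/sdc1", "/dev/sdc2" }, { "/dev/sdd1", "-" },
	};
	const char *err;

	for (int i = 0; i < 4; i++)
		printf("%-10s %-10s -> %s\n", pairs[i][0], pairs[i][1],
		       check_dev_pair(pairs[i][0], pairs[i][1], &err) ? err : "ok");
	return 0;
}
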
@@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv) | |||
235 | } | 270 | } |
236 | 271 | ||
237 | /* | 272 | /* |
273 | * validate_region_size | ||
274 | * @rs | ||
275 | * @region_size: region size in sectors. If 0, pick a size (4MiB default). | ||
276 | * | ||
277 | * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). | ||
278 | * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. | ||
279 | * | ||
280 | * Returns: 0 on success, -EINVAL on failure. | ||
281 | */ | ||
282 | static int validate_region_size(struct raid_set *rs, unsigned long region_size) | ||
283 | { | ||
284 | unsigned long min_region_size = rs->ti->len / (1 << 21); | ||
285 | |||
286 | if (!region_size) { | ||
287 | /* | ||
288 | * Choose a reasonable default. All figures in sectors. | ||
289 | */ | ||
290 | if (min_region_size > (1 << 13)) { | ||
291 | DMINFO("Choosing default region size of %lu sectors", | ||
292 | region_size); | ||
293 | region_size = min_region_size; | ||
294 | } else { | ||
295 | DMINFO("Choosing default region size of 4MiB"); | ||
296 | region_size = 1 << 13; /* sectors */ | ||
297 | } | ||
298 | } else { | ||
299 | /* | ||
300 | * Validate user-supplied value. | ||
301 | */ | ||
302 | if (region_size > rs->ti->len) { | ||
303 | rs->ti->error = "Supplied region size is too large"; | ||
304 | return -EINVAL; | ||
305 | } | ||
306 | |||
307 | if (region_size < min_region_size) { | ||
308 | DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", | ||
309 | region_size, min_region_size); | ||
310 | rs->ti->error = "Supplied region size is too small"; | ||
311 | return -EINVAL; | ||
312 | } | ||
313 | |||
314 | if (!is_power_of_2(region_size)) { | ||
315 | rs->ti->error = "Region size is not a power of 2"; | ||
316 | return -EINVAL; | ||
317 | } | ||
318 | |||
319 | if (region_size < rs->md.chunk_sectors) { | ||
320 | rs->ti->error = "Region size is smaller than the chunk size"; | ||
321 | return -EINVAL; | ||
322 | } | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Convert sectors to bytes. | ||
327 | */ | ||
328 | rs->md.bitmap_info.chunksize = (region_size << 9); | ||
329 | |||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | /* | ||
238 | * Possible arguments are... | 334 | * Possible arguments are... |
239 | * RAID456: | ||
240 | * <chunk_size> [optional_args] | 335 | * <chunk_size> [optional_args] |
241 | * | 336 | * |
242 | * Optional args: | 337 | * Argument definitions |
243 | * [[no]sync] Force or prevent recovery of the entire array | 338 | * <chunk_size> The number of sectors per disk that |
339 | * will form the "stripe" | ||
340 | * [[no]sync] Force or prevent recovery of the | ||
341 | * entire array | ||
244 | * [rebuild <idx>] Rebuild the drive indicated by the index | 342 | * [rebuild <idx>] Rebuild the drive indicated by the index |
245 | * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits | 343 | * [daemon_sleep <ms>] Time between bitmap daemon work to |
344 | * clear bits | ||
246 | * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization | 345 | * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization |
247 | * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization | 346 | * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization |
347 | * [write_mostly <idx>] Indicate a write mostly drive via index | ||
248 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) | 348 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) |
249 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs | 349 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs |
350 | * [region_size <sectors>] Defines granularity of bitmap | ||
250 | */ | 351 | */ |
251 | static int parse_raid_params(struct raid_set *rs, char **argv, | 352 | static int parse_raid_params(struct raid_set *rs, char **argv, |
252 | unsigned num_raid_params) | 353 | unsigned num_raid_params) |
253 | { | 354 | { |
254 | unsigned i, rebuild_cnt = 0; | 355 | unsigned i, rebuild_cnt = 0; |
255 | unsigned long value; | 356 | unsigned long value, region_size = 0; |
256 | char *key; | 357 | char *key; |
257 | 358 | ||
258 | /* | 359 | /* |
259 | * First, parse the in-order required arguments | 360 | * First, parse the in-order required arguments |
361 | * "chunk_size" is the only argument of this type. | ||
260 | */ | 362 | */ |
261 | if ((strict_strtoul(argv[0], 10, &value) < 0) || | 363 | if ((strict_strtoul(argv[0], 10, &value) < 0)) { |
262 | !is_power_of_2(value) || (value < 8)) { | ||
263 | rs->ti->error = "Bad chunk size"; | 364 | rs->ti->error = "Bad chunk size"; |
264 | return -EINVAL; | 365 | return -EINVAL; |
366 | } else if (rs->raid_type->level == 1) { | ||
367 | if (value) | ||
368 | DMERR("Ignoring chunk size parameter for RAID 1"); | ||
369 | value = 0; | ||
370 | } else if (!is_power_of_2(value)) { | ||
371 | rs->ti->error = "Chunk size must be a power of 2"; | ||
372 | return -EINVAL; | ||
373 | } else if (value < 8) { | ||
374 | rs->ti->error = "Chunk size value is too small"; | ||
375 | return -EINVAL; | ||
265 | } | 376 | } |
266 | 377 | ||
267 | rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; | 378 | rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; |
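
validate_region_size() above picks max(4MiB, ti->len / 2^21) when no region_size is given and otherwise enforces the power-of-two, minimum-size and chunk-size constraints; the 2^21 bound is the MD bitmap's limit on the number of regions per device. A simplified user-space model of that selection, working purely in 512-byte sectors:

#include <stdio.h>
#include <stdint.h>

/* Returns the region size actually used, or 0 on a validation error. */
static uint64_t pick_region_size(uint64_t ti_len, uint64_t chunk,
				 uint64_t requested)
{
	uint64_t min_region = ti_len >> 21;	/* keep region count < 2^21 */

	if (!requested)
		return (min_region > 8192) ? min_region : 8192;	/* 4MiB default */

	if (requested > ti_len || requested < min_region)
		return 0;
	if (requested & (requested - 1))	/* must be a power of 2 */
		return 0;
	if (requested < chunk)
		return 0;
	return requested;
}

int main(void)
{
	/* 1TiB target, 64KiB chunks (128 sectors), user asks for 8MiB regions */
	uint64_t len = 1ULL << 31, chunk = 128;
	uint64_t region = pick_region_size(len, chunk, 16384);

	printf("region_size = %llu sectors (%llu KiB)\n",
	       (unsigned long long)region, (unsigned long long)region / 2);
	return 0;
}
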
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
269 | num_raid_params--; | 380 | num_raid_params--; |
270 | 381 | ||
271 | /* | 382 | /* |
272 | * Second, parse the unordered optional arguments | 383 | * We set each individual device as In_sync with a completed |
384 | * 'recovery_offset'. If there has been a device failure or | ||
385 | * replacement then one of the following cases applies: | ||
386 | * | ||
387 | * 1) User specifies 'rebuild'. | ||
388 | * - Device is reset when param is read. | ||
389 | * 2) A new device is supplied. | ||
390 | * - No matching superblock found, resets device. | ||
391 | * 3) Device failure was transient and returns on reload. | ||
392 | * - Failure noticed, resets device for bitmap replay. | ||
393 | * 4) Device hadn't completed recovery after previous failure. | ||
394 | * - Superblock is read and overrides recovery_offset. | ||
395 | * | ||
396 | * What is found in the superblocks of the devices is always | ||
397 | * authoritative, unless 'rebuild' or '[no]sync' was specified. | ||
273 | */ | 398 | */ |
274 | for (i = 0; i < rs->md.raid_disks; i++) | 399 | for (i = 0; i < rs->md.raid_disks; i++) { |
275 | set_bit(In_sync, &rs->dev[i].rdev.flags); | 400 | set_bit(In_sync, &rs->dev[i].rdev.flags); |
401 | rs->dev[i].rdev.recovery_offset = MaxSector; | ||
402 | } | ||
276 | 403 | ||
404 | /* | ||
405 | * Second, parse the unordered optional arguments | ||
406 | */ | ||
277 | for (i = 0; i < num_raid_params; i++) { | 407 | for (i = 0; i < num_raid_params; i++) { |
278 | if (!strcmp(argv[i], "nosync")) { | 408 | if (!strcasecmp(argv[i], "nosync")) { |
279 | rs->md.recovery_cp = MaxSector; | 409 | rs->md.recovery_cp = MaxSector; |
280 | rs->print_flags |= DMPF_NOSYNC; | 410 | rs->print_flags |= DMPF_NOSYNC; |
281 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
282 | continue; | 411 | continue; |
283 | } | 412 | } |
284 | if (!strcmp(argv[i], "sync")) { | 413 | if (!strcasecmp(argv[i], "sync")) { |
285 | rs->md.recovery_cp = 0; | 414 | rs->md.recovery_cp = 0; |
286 | rs->print_flags |= DMPF_SYNC; | 415 | rs->print_flags |= DMPF_SYNC; |
287 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
288 | continue; | 416 | continue; |
289 | } | 417 | } |
290 | 418 | ||
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
300 | return -EINVAL; | 428 | return -EINVAL; |
301 | } | 429 | } |
302 | 430 | ||
303 | if (!strcmp(key, "rebuild")) { | 431 | if (!strcasecmp(key, "rebuild")) { |
304 | if (++rebuild_cnt > rs->raid_type->parity_devs) { | 432 | rebuild_cnt++; |
305 | rs->ti->error = "Too many rebuild drives given"; | 433 | if (((rs->raid_type->level != 1) && |
434 | (rebuild_cnt > rs->raid_type->parity_devs)) || | ||
435 | ((rs->raid_type->level == 1) && | ||
436 | (rebuild_cnt > (rs->md.raid_disks - 1)))) { | ||
437 | rs->ti->error = "Too many rebuild devices specified for given RAID type"; | ||
306 | return -EINVAL; | 438 | return -EINVAL; |
307 | } | 439 | } |
308 | if (value > rs->md.raid_disks) { | 440 | if (value > rs->md.raid_disks) { |
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
311 | } | 443 | } |
312 | clear_bit(In_sync, &rs->dev[value].rdev.flags); | 444 | clear_bit(In_sync, &rs->dev[value].rdev.flags); |
313 | rs->dev[value].rdev.recovery_offset = 0; | 445 | rs->dev[value].rdev.recovery_offset = 0; |
314 | } else if (!strcmp(key, "max_write_behind")) { | 446 | rs->print_flags |= DMPF_REBUILD; |
447 | } else if (!strcasecmp(key, "write_mostly")) { | ||
448 | if (rs->raid_type->level != 1) { | ||
449 | rs->ti->error = "write_mostly option is only valid for RAID1"; | ||
450 | return -EINVAL; | ||
451 | } | ||
452 | if (value >= rs->md.raid_disks) { | ||
453 | rs->ti->error = "Invalid write_mostly drive index given"; | ||
454 | return -EINVAL; | ||
455 | } | ||
456 | set_bit(WriteMostly, &rs->dev[value].rdev.flags); | ||
457 | } else if (!strcasecmp(key, "max_write_behind")) { | ||
458 | if (rs->raid_type->level != 1) { | ||
459 | rs->ti->error = "max_write_behind option is only valid for RAID1"; | ||
460 | return -EINVAL; | ||
461 | } | ||
315 | rs->print_flags |= DMPF_MAX_WRITE_BEHIND; | 462 | rs->print_flags |= DMPF_MAX_WRITE_BEHIND; |
316 | 463 | ||
317 | /* | 464 | /* |
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
324 | return -EINVAL; | 471 | return -EINVAL; |
325 | } | 472 | } |
326 | rs->md.bitmap_info.max_write_behind = value; | 473 | rs->md.bitmap_info.max_write_behind = value; |
327 | } else if (!strcmp(key, "daemon_sleep")) { | 474 | } else if (!strcasecmp(key, "daemon_sleep")) { |
328 | rs->print_flags |= DMPF_DAEMON_SLEEP; | 475 | rs->print_flags |= DMPF_DAEMON_SLEEP; |
329 | if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { | 476 | if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { |
330 | rs->ti->error = "daemon sleep period out of range"; | 477 | rs->ti->error = "daemon sleep period out of range"; |
331 | return -EINVAL; | 478 | return -EINVAL; |
332 | } | 479 | } |
333 | rs->md.bitmap_info.daemon_sleep = value; | 480 | rs->md.bitmap_info.daemon_sleep = value; |
334 | } else if (!strcmp(key, "stripe_cache")) { | 481 | } else if (!strcasecmp(key, "stripe_cache")) { |
335 | rs->print_flags |= DMPF_STRIPE_CACHE; | 482 | rs->print_flags |= DMPF_STRIPE_CACHE; |
336 | 483 | ||
337 | /* | 484 | /* |
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
348 | rs->ti->error = "Bad stripe_cache size"; | 495 | rs->ti->error = "Bad stripe_cache size"; |
349 | return -EINVAL; | 496 | return -EINVAL; |
350 | } | 497 | } |
351 | } else if (!strcmp(key, "min_recovery_rate")) { | 498 | } else if (!strcasecmp(key, "min_recovery_rate")) { |
352 | rs->print_flags |= DMPF_MIN_RECOVERY_RATE; | 499 | rs->print_flags |= DMPF_MIN_RECOVERY_RATE; |
353 | if (value > INT_MAX) { | 500 | if (value > INT_MAX) { |
354 | rs->ti->error = "min_recovery_rate out of range"; | 501 | rs->ti->error = "min_recovery_rate out of range"; |
355 | return -EINVAL; | 502 | return -EINVAL; |
356 | } | 503 | } |
357 | rs->md.sync_speed_min = (int)value; | 504 | rs->md.sync_speed_min = (int)value; |
358 | } else if (!strcmp(key, "max_recovery_rate")) { | 505 | } else if (!strcasecmp(key, "max_recovery_rate")) { |
359 | rs->print_flags |= DMPF_MAX_RECOVERY_RATE; | 506 | rs->print_flags |= DMPF_MAX_RECOVERY_RATE; |
360 | if (value > INT_MAX) { | 507 | if (value > INT_MAX) { |
361 | rs->ti->error = "max_recovery_rate out of range"; | 508 | rs->ti->error = "max_recovery_rate out of range"; |
362 | return -EINVAL; | 509 | return -EINVAL; |
363 | } | 510 | } |
364 | rs->md.sync_speed_max = (int)value; | 511 | rs->md.sync_speed_max = (int)value; |
512 | } else if (!strcasecmp(key, "region_size")) { | ||
513 | rs->print_flags |= DMPF_REGION_SIZE; | ||
514 | region_size = value; | ||
365 | } else { | 515 | } else { |
366 | DMERR("Unable to parse RAID parameter: %s", key); | 516 | DMERR("Unable to parse RAID parameter: %s", key); |
367 | rs->ti->error = "Unable to parse RAID parameters"; | 517 | rs->ti->error = "Unable to parse RAID parameters"; |
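
The rebuild and write_mostly handling above encodes a per-level limit: RAID1 may rebuild any number of members short of all of them, while the parity levels tolerate at most parity_devs devices rebuilding at once. A tiny sketch of that rule:

#include <stdio.h>

static int max_rebuilds(int level, int raid_disks, int parity_devs)
{
	return (level == 1) ? raid_disks - 1 : parity_devs;
}

int main(void)
{
	printf("raid1, 3 disks : %d\n", max_rebuilds(1, 3, 0));	/* 2 */
	printf("raid5, 4 disks : %d\n", max_rebuilds(5, 4, 1));	/* 1 */
	printf("raid6, 6 disks : %d\n", max_rebuilds(6, 6, 2));	/* 2 */
	return 0;
}
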
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
369 | } | 519 | } |
370 | } | 520 | } |
371 | 521 | ||
522 | if (validate_region_size(rs, region_size)) | ||
523 | return -EINVAL; | ||
524 | |||
525 | if (rs->md.chunk_sectors) | ||
526 | rs->ti->split_io = rs->md.chunk_sectors; | ||
527 | else | ||
528 | rs->ti->split_io = region_size; | ||
529 | |||
530 | if (rs->md.chunk_sectors) | ||
531 | rs->ti->split_io = rs->md.chunk_sectors; | ||
532 | else | ||
533 | rs->ti->split_io = region_size; | ||
534 | |||
372 | /* Assume there are no metadata devices until the drives are parsed */ | 535 | /* Assume there are no metadata devices until the drives are parsed */ |
373 | rs->md.persistent = 0; | 536 | rs->md.persistent = 0; |
374 | rs->md.external = 1; | 537 | rs->md.external = 1; |
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | |||
387 | { | 550 | { |
388 | struct raid_set *rs = container_of(cb, struct raid_set, callbacks); | 551 | struct raid_set *rs = container_of(cb, struct raid_set, callbacks); |
389 | 552 | ||
553 | if (rs->raid_type->level == 1) | ||
554 | return md_raid1_congested(&rs->md, bits); | ||
555 | |||
390 | return md_raid5_congested(&rs->md, bits); | 556 | return md_raid5_congested(&rs->md, bits); |
391 | } | 557 | } |
392 | 558 | ||
393 | /* | 559 | /* |
560 | * This structure is never routinely used by userspace, unlike md superblocks. | ||
561 | * Devices with this superblock should only ever be accessed via device-mapper. | ||
562 | */ | ||
563 | #define DM_RAID_MAGIC 0x64526D44 | ||
564 | struct dm_raid_superblock { | ||
565 | __le32 magic; /* "DmRd" */ | ||
566 | __le32 features; /* Used to indicate possible future changes */ | ||
567 | |||
568 | __le32 num_devices; /* Number of devices in this array. (Max 64) */ | ||
569 | __le32 array_position; /* The position of this drive in the array */ | ||
570 | |||
571 | __le64 events; /* Incremented by md when superblock updated */ | ||
572 | __le64 failed_devices; /* Bit field of devices to indicate failures */ | ||
573 | |||
574 | /* | ||
575 | * This offset tracks the progress of the repair or replacement of | ||
576 | * an individual drive. | ||
577 | */ | ||
578 | __le64 disk_recovery_offset; | ||
579 | |||
580 | /* | ||
581 | * This offset tracks the progress of the initial array | ||
582 | * synchronisation/parity calculation. | ||
583 | */ | ||
584 | __le64 array_resync_offset; | ||
585 | |||
586 | /* | ||
587 | * RAID characteristics | ||
588 | */ | ||
589 | __le32 level; | ||
590 | __le32 layout; | ||
591 | __le32 stripe_sectors; | ||
592 | |||
593 | __u8 pad[452]; /* Round struct to 512 bytes. */ | ||
594 | /* Always set to 0 when writing. */ | ||
595 | } __packed; | ||
596 | |||
597 | static int read_disk_sb(mdk_rdev_t *rdev, int size) | ||
598 | { | ||
599 | BUG_ON(!rdev->sb_page); | ||
600 | |||
601 | if (rdev->sb_loaded) | ||
602 | return 0; | ||
603 | |||
604 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | ||
605 | DMERR("Failed to read device superblock"); | ||
606 | return -EINVAL; | ||
607 | } | ||
608 | |||
609 | rdev->sb_loaded = 1; | ||
610 | |||
611 | return 0; | ||
612 | } | ||
613 | |||
614 | static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev) | ||
615 | { | ||
616 | mdk_rdev_t *r, *t; | ||
617 | uint64_t failed_devices; | ||
618 | struct dm_raid_superblock *sb; | ||
619 | |||
620 | sb = page_address(rdev->sb_page); | ||
621 | failed_devices = le64_to_cpu(sb->failed_devices); | ||
622 | |||
623 | rdev_for_each(r, t, mddev) | ||
624 | if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) | ||
625 | failed_devices |= (1ULL << r->raid_disk); | ||
626 | |||
627 | memset(sb, 0, sizeof(*sb)); | ||
628 | |||
629 | sb->magic = cpu_to_le32(DM_RAID_MAGIC); | ||
630 | sb->features = cpu_to_le32(0); /* No features yet */ | ||
631 | |||
632 | sb->num_devices = cpu_to_le32(mddev->raid_disks); | ||
633 | sb->array_position = cpu_to_le32(rdev->raid_disk); | ||
634 | |||
635 | sb->events = cpu_to_le64(mddev->events); | ||
636 | sb->failed_devices = cpu_to_le64(failed_devices); | ||
637 | |||
638 | sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); | ||
639 | sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); | ||
640 | |||
641 | sb->level = cpu_to_le32(mddev->level); | ||
642 | sb->layout = cpu_to_le32(mddev->layout); | ||
643 | sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); | ||
644 | } | ||
645 | |||
646 | /* | ||
647 | * super_load | ||
648 | * | ||
649 | * This function creates a superblock if one is not found on the device | ||
650 | * and will decide which superblock to use if there's a choice. | ||
651 | * | ||
652 | * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise | ||
653 | */ | ||
654 | static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) | ||
655 | { | ||
656 | int ret; | ||
657 | struct dm_raid_superblock *sb; | ||
658 | struct dm_raid_superblock *refsb; | ||
659 | uint64_t events_sb, events_refsb; | ||
660 | |||
661 | rdev->sb_start = 0; | ||
662 | rdev->sb_size = sizeof(*sb); | ||
663 | |||
664 | ret = read_disk_sb(rdev, rdev->sb_size); | ||
665 | if (ret) | ||
666 | return ret; | ||
667 | |||
668 | sb = page_address(rdev->sb_page); | ||
669 | if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { | ||
670 | super_sync(rdev->mddev, rdev); | ||
671 | |||
672 | set_bit(FirstUse, &rdev->flags); | ||
673 | |||
674 | /* Force writing of superblocks to disk */ | ||
675 | set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); | ||
676 | |||
677 | /* Any superblock is better than none, choose that if given */ | ||
678 | return refdev ? 0 : 1; | ||
679 | } | ||
680 | |||
681 | if (!refdev) | ||
682 | return 1; | ||
683 | |||
684 | events_sb = le64_to_cpu(sb->events); | ||
685 | |||
686 | refsb = page_address(refdev->sb_page); | ||
687 | events_refsb = le64_to_cpu(refsb->events); | ||
688 | |||
689 | return (events_sb > events_refsb) ? 1 : 0; | ||
690 | } | ||
691 | |||
692 | static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev) | ||
693 | { | ||
694 | int role; | ||
695 | struct raid_set *rs = container_of(mddev, struct raid_set, md); | ||
696 | uint64_t events_sb; | ||
697 | uint64_t failed_devices; | ||
698 | struct dm_raid_superblock *sb; | ||
699 | uint32_t new_devs = 0; | ||
700 | uint32_t rebuilds = 0; | ||
701 | mdk_rdev_t *r, *t; | ||
702 | struct dm_raid_superblock *sb2; | ||
703 | |||
704 | sb = page_address(rdev->sb_page); | ||
705 | events_sb = le64_to_cpu(sb->events); | ||
706 | failed_devices = le64_to_cpu(sb->failed_devices); | ||
707 | |||
708 | /* | ||
709 | * Initialise to 1 if this is a new superblock. | ||
710 | */ | ||
711 | mddev->events = events_sb ? : 1; | ||
712 | |||
713 | /* | ||
714 | * Reshaping is not currently allowed | ||
715 | */ | ||
716 | if ((le32_to_cpu(sb->level) != mddev->level) || | ||
717 | (le32_to_cpu(sb->layout) != mddev->layout) || | ||
718 | (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { | ||
719 | DMERR("Reshaping arrays not yet supported."); | ||
720 | return -EINVAL; | ||
721 | } | ||
722 | |||
723 | /* We can only change the number of devices in RAID1 right now */ | ||
724 | if ((rs->raid_type->level != 1) && | ||
725 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { | ||
726 | DMERR("Reshaping arrays not yet supported."); | ||
727 | return -EINVAL; | ||
728 | } | ||
729 | |||
730 | if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) | ||
731 | mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); | ||
732 | |||
733 | /* | ||
734 | * During load, we set FirstUse if a new superblock was written. | ||
735 | * There are two reasons we might not have a superblock: | ||
736 | * 1) The array is brand new - in which case, all of the | ||
737 | * devices must have their In_sync bit set. Also, | ||
738 | * recovery_cp must be 0, unless forced. | ||
739 | * 2) This is a new device being added to an old array | ||
740 | * and the new device needs to be rebuilt - in which | ||
741 | * case the In_sync bit will /not/ be set and | ||
742 | * recovery_cp must be MaxSector. | ||
743 | */ | ||
744 | rdev_for_each(r, t, mddev) { | ||
745 | if (!test_bit(In_sync, &r->flags)) { | ||
746 | if (!test_bit(FirstUse, &r->flags)) | ||
747 | DMERR("Superblock area of " | ||
748 | "rebuild device %d should have been " | ||
749 | "cleared.", r->raid_disk); | ||
750 | set_bit(FirstUse, &r->flags); | ||
751 | rebuilds++; | ||
752 | } else if (test_bit(FirstUse, &r->flags)) | ||
753 | new_devs++; | ||
754 | } | ||
755 | |||
756 | if (!rebuilds) { | ||
757 | if (new_devs == mddev->raid_disks) { | ||
758 | DMINFO("Superblocks created for new array"); | ||
759 | set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); | ||
760 | } else if (new_devs) { | ||
761 | DMERR("New device injected " | ||
762 | "into existing array without 'rebuild' " | ||
763 | "parameter specified"); | ||
764 | return -EINVAL; | ||
765 | } | ||
766 | } else if (new_devs) { | ||
767 | DMERR("'rebuild' devices cannot be " | ||
768 | "injected into an array with other first-time devices"); | ||
769 | return -EINVAL; | ||
770 | } else if (mddev->recovery_cp != MaxSector) { | ||
771 | DMERR("'rebuild' specified while array is not in-sync"); | ||
772 | return -EINVAL; | ||
773 | } | ||
774 | |||
775 | /* | ||
776 | * Now we set the Faulty bit for those devices that are | ||
777 | * recorded in the superblock as failed. | ||
778 | */ | ||
779 | rdev_for_each(r, t, mddev) { | ||
780 | if (!r->sb_page) | ||
781 | continue; | ||
782 | sb2 = page_address(r->sb_page); | ||
783 | sb2->failed_devices = 0; | ||
784 | |||
785 | /* | ||
786 | * Check for any device re-ordering. | ||
787 | */ | ||
788 | if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { | ||
789 | role = le32_to_cpu(sb2->array_position); | ||
790 | if (role != r->raid_disk) { | ||
791 | if (rs->raid_type->level != 1) { | ||
792 | rs->ti->error = "Cannot change device " | ||
793 | "positions in RAID array"; | ||
794 | return -EINVAL; | ||
795 | } | ||
796 | DMINFO("RAID1 device #%d now at position #%d", | ||
797 | role, r->raid_disk); | ||
798 | } | ||
799 | |||
800 | /* | ||
801 | * Partial recovery is performed on | ||
802 | * returning failed devices. | ||
803 | */ | ||
804 | if (failed_devices & (1 << role)) | ||
805 | set_bit(Faulty, &r->flags); | ||
806 | } | ||
807 | } | ||
808 | |||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev) | ||
813 | { | ||
814 | struct dm_raid_superblock *sb = page_address(rdev->sb_page); | ||
815 | |||
816 | /* | ||
817 | * If mddev->events is not set, we know we have not yet initialized | ||
818 | * the array. | ||
819 | */ | ||
820 | if (!mddev->events && super_init_validation(mddev, rdev)) | ||
821 | return -EINVAL; | ||
822 | |||
823 | mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ | ||
824 | rdev->mddev->bitmap_info.default_offset = 4096 >> 9; | ||
825 | if (!test_bit(FirstUse, &rdev->flags)) { | ||
826 | rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); | ||
827 | if (rdev->recovery_offset != MaxSector) | ||
828 | clear_bit(In_sync, &rdev->flags); | ||
829 | } | ||
830 | |||
831 | /* | ||
832 | * If a device comes back, set it as not In_sync and no longer faulty. | ||
833 | */ | ||
834 | if (test_bit(Faulty, &rdev->flags)) { | ||
835 | clear_bit(Faulty, &rdev->flags); | ||
836 | clear_bit(In_sync, &rdev->flags); | ||
837 | rdev->saved_raid_disk = rdev->raid_disk; | ||
838 | rdev->recovery_offset = 0; | ||
839 | } | ||
840 | |||
841 | clear_bit(FirstUse, &rdev->flags); | ||
842 | |||
843 | return 0; | ||
844 | } | ||
845 | |||
846 | /* | ||
847 | * Analyse superblocks and select the freshest. | ||
848 | */ | ||
849 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | ||
850 | { | ||
851 | int ret; | ||
852 | mdk_rdev_t *rdev, *freshest, *tmp; | ||
853 | mddev_t *mddev = &rs->md; | ||
854 | |||
855 | freshest = NULL; | ||
856 | rdev_for_each(rdev, tmp, mddev) { | ||
857 | if (!rdev->meta_bdev) | ||
858 | continue; | ||
859 | |||
860 | ret = super_load(rdev, freshest); | ||
861 | |||
862 | switch (ret) { | ||
863 | case 1: | ||
864 | freshest = rdev; | ||
865 | break; | ||
866 | case 0: | ||
867 | break; | ||
868 | default: | ||
869 | ti->error = "Failed to load superblock"; | ||
870 | return ret; | ||
871 | } | ||
872 | } | ||
873 | |||
874 | if (!freshest) | ||
875 | return 0; | ||
876 | |||
877 | /* | ||
878 | * Validation of the freshest device provides the source of | ||
879 | * validation for the remaining devices. | ||
880 | */ | ||
881 | ti->error = "Unable to assemble array: Invalid superblocks"; | ||
882 | if (super_validate(mddev, freshest)) | ||
883 | return -EINVAL; | ||
884 | |||
885 | rdev_for_each(rdev, tmp, mddev) | ||
886 | if ((rdev != freshest) && super_validate(mddev, rdev)) | ||
887 | return -EINVAL; | ||
888 | |||
889 | return 0; | ||
890 | } | ||
891 | |||
892 | /* | ||
394 | * Construct a RAID4/5/6 mapping: | 893 | * Construct a RAID4/5/6 mapping: |
395 | * Args: | 894 | * Args: |
396 | * <raid_type> <#raid_params> <raid_params> \ | 895 | * <raid_type> <#raid_params> <raid_params> \ |
397 | * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } | 896 | * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } |
398 | * | 897 | * |
399 | * ** metadata devices are not supported yet, use '-' instead ** | ||
400 | * | ||
401 | * <raid_params> varies by <raid_type>. See 'parse_raid_params' for | 898 | * <raid_params> varies by <raid_type>. See 'parse_raid_params' for |
402 | * details on possible <raid_params>. | 899 | * details on possible <raid_params>. |
403 | */ | 900 | */ |
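
The dm_raid_superblock introduced above is a single 512-byte little-endian block at sector 0 of each metadata device, and super_load() keeps whichever copy carries the highest events count. A user-space sketch of the layout and of that freshest-copy selection; plain fixed-width integers stand in for the kernel's __le types and byte-order conversion is omitted for brevity:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define DM_RAID_MAGIC 0x64526D44	/* "DmRd" */

struct dm_raid_sb {
	uint32_t magic;
	uint32_t features;
	uint32_t num_devices;
	uint32_t array_position;
	uint64_t events;
	uint64_t failed_devices;	/* one bit per raid_disk */
	uint64_t disk_recovery_offset;
	uint64_t array_resync_offset;
	uint32_t level;
	uint32_t layout;
	uint32_t stripe_sectors;
	uint8_t  pad[452];		/* pad to one 512-byte sector */
} __attribute__((packed));

/* Pick the superblock with the highest event count, as super_load() does. */
static const struct dm_raid_sb *freshest(const struct dm_raid_sb *sbs, int n)
{
	const struct dm_raid_sb *best = NULL;

	for (int i = 0; i < n; i++)
		if (sbs[i].magic == DM_RAID_MAGIC &&
		    (!best || sbs[i].events > best->events))
			best = &sbs[i];
	return best;
}

int main(void)
{
	static_assert(sizeof(struct dm_raid_sb) == 512, "superblock must be 512B");

	struct dm_raid_sb sbs[2] = {
		{ .magic = DM_RAID_MAGIC, .array_position = 0, .events = 41 },
		{ .magic = DM_RAID_MAGIC, .array_position = 1, .events = 42 },
	};

	printf("freshest sb: device %u (events %llu)\n",
	       (unsigned)freshest(sbs, 2)->array_position,
	       (unsigned long long)freshest(sbs, 2)->events);
	return 0;
}
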
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
465 | if (ret) | 962 | if (ret) |
466 | goto bad; | 963 | goto bad; |
467 | 964 | ||
965 | rs->md.sync_super = super_sync; | ||
966 | ret = analyse_superblocks(ti, rs); | ||
967 | if (ret) | ||
968 | goto bad; | ||
969 | |||
468 | INIT_WORK(&rs->md.event_work, do_table_event); | 970 | INIT_WORK(&rs->md.event_work, do_table_event); |
469 | ti->split_io = rs->md.chunk_sectors; | ||
470 | ti->private = rs; | 971 | ti->private = rs; |
471 | 972 | ||
472 | mutex_lock(&rs->md.reconfig_mutex); | 973 | mutex_lock(&rs->md.reconfig_mutex); |
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
482 | rs->callbacks.congested_fn = raid_is_congested; | 983 | rs->callbacks.congested_fn = raid_is_congested; |
483 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | 984 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); |
484 | 985 | ||
986 | mddev_suspend(&rs->md); | ||
485 | return 0; | 987 | return 0; |
486 | 988 | ||
487 | bad: | 989 | bad: |
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
546 | break; | 1048 | break; |
547 | case STATUSTYPE_TABLE: | 1049 | case STATUSTYPE_TABLE: |
548 | /* The string you would use to construct this array */ | 1050 | /* The string you would use to construct this array */ |
549 | for (i = 0; i < rs->md.raid_disks; i++) | 1051 | for (i = 0; i < rs->md.raid_disks; i++) { |
550 | if (rs->dev[i].data_dev && | 1052 | if ((rs->print_flags & DMPF_REBUILD) && |
1053 | rs->dev[i].data_dev && | ||
551 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | 1054 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) |
552 | raid_param_cnt++; /* for rebuilds */ | 1055 | raid_param_cnt += 2; /* for rebuilds */ |
1056 | if (rs->dev[i].data_dev && | ||
1057 | test_bit(WriteMostly, &rs->dev[i].rdev.flags)) | ||
1058 | raid_param_cnt += 2; | ||
1059 | } | ||
553 | 1060 | ||
554 | raid_param_cnt += (hweight64(rs->print_flags) * 2); | 1061 | raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); |
555 | if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) | 1062 | if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) |
556 | raid_param_cnt--; | 1063 | raid_param_cnt--; |
557 | 1064 | ||
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
565 | DMEMIT(" nosync"); | 1072 | DMEMIT(" nosync"); |
566 | 1073 | ||
567 | for (i = 0; i < rs->md.raid_disks; i++) | 1074 | for (i = 0; i < rs->md.raid_disks; i++) |
568 | if (rs->dev[i].data_dev && | 1075 | if ((rs->print_flags & DMPF_REBUILD) && |
1076 | rs->dev[i].data_dev && | ||
569 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | 1077 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) |
570 | DMEMIT(" rebuild %u", i); | 1078 | DMEMIT(" rebuild %u", i); |
571 | 1079 | ||
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
579 | if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) | 1087 | if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) |
580 | DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); | 1088 | DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); |
581 | 1089 | ||
1090 | for (i = 0; i < rs->md.raid_disks; i++) | ||
1091 | if (rs->dev[i].data_dev && | ||
1092 | test_bit(WriteMostly, &rs->dev[i].rdev.flags)) | ||
1093 | DMEMIT(" write_mostly %u", i); | ||
1094 | |||
582 | if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) | 1095 | if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) |
583 | DMEMIT(" max_write_behind %lu", | 1096 | DMEMIT(" max_write_behind %lu", |
584 | rs->md.bitmap_info.max_write_behind); | 1097 | rs->md.bitmap_info.max_write_behind); |
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
591 | conf ? conf->max_nr_stripes * 2 : 0); | 1104 | conf ? conf->max_nr_stripes * 2 : 0); |
592 | } | 1105 | } |
593 | 1106 | ||
1107 | if (rs->print_flags & DMPF_REGION_SIZE) | ||
1108 | DMEMIT(" region_size %lu", | ||
1109 | rs->md.bitmap_info.chunksize >> 9); | ||
1110 | |||
594 | DMEMIT(" %d", rs->md.raid_disks); | 1111 | DMEMIT(" %d", rs->md.raid_disks); |
595 | for (i = 0; i < rs->md.raid_disks; i++) { | 1112 | for (i = 0; i < rs->md.raid_disks; i++) { |
596 | DMEMIT(" -"); /* metadata device */ | 1113 | if (rs->dev[i].meta_dev) |
1114 | DMEMIT(" %s", rs->dev[i].meta_dev->name); | ||
1115 | else | ||
1116 | DMEMIT(" -"); | ||
597 | 1117 | ||
598 | if (rs->dev[i].data_dev) | 1118 | if (rs->dev[i].data_dev) |
599 | DMEMIT(" %s", rs->dev[i].data_dev->name); | 1119 | DMEMIT(" %s", rs->dev[i].data_dev->name); |
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti) | |||
650 | { | 1170 | { |
651 | struct raid_set *rs = ti->private; | 1171 | struct raid_set *rs = ti->private; |
652 | 1172 | ||
1173 | bitmap_load(&rs->md); | ||
653 | mddev_resume(&rs->md); | 1174 | mddev_resume(&rs->md); |
654 | } | 1175 | } |
655 | 1176 | ||
656 | static struct target_type raid_target = { | 1177 | static struct target_type raid_target = { |
657 | .name = "raid", | 1178 | .name = "raid", |
658 | .version = {1, 0, 0}, | 1179 | .version = {1, 1, 0}, |
659 | .module = THIS_MODULE, | 1180 | .module = THIS_MODULE, |
660 | .ctr = raid_ctr, | 1181 | .ctr = raid_ctr, |
661 | .dtr = raid_dtr, | 1182 | .dtr = raid_dtr, |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 135c2f1fdbf..d1f1d701710 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -58,25 +58,30 @@ | |||
58 | #define NUM_SNAPSHOT_HDR_CHUNKS 1 | 58 | #define NUM_SNAPSHOT_HDR_CHUNKS 1 |
59 | 59 | ||
60 | struct disk_header { | 60 | struct disk_header { |
61 | uint32_t magic; | 61 | __le32 magic; |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Is this snapshot valid. There is no way of recovering | 64 | * Is this snapshot valid. There is no way of recovering |
65 | * an invalid snapshot. | 65 | * an invalid snapshot. |
66 | */ | 66 | */ |
67 | uint32_t valid; | 67 | __le32 valid; |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Simple, incrementing version. no backward | 70 | * Simple, incrementing version. no backward |
71 | * compatibility. | 71 | * compatibility. |
72 | */ | 72 | */ |
73 | uint32_t version; | 73 | __le32 version; |
74 | 74 | ||
75 | /* In sectors */ | 75 | /* In sectors */ |
76 | uint32_t chunk_size; | 76 | __le32 chunk_size; |
77 | }; | 77 | } __packed; |
78 | 78 | ||
79 | struct disk_exception { | 79 | struct disk_exception { |
80 | __le64 old_chunk; | ||
81 | __le64 new_chunk; | ||
82 | } __packed; | ||
83 | |||
84 | struct core_exception { | ||
80 | uint64_t old_chunk; | 85 | uint64_t old_chunk; |
81 | uint64_t new_chunk; | 86 | uint64_t new_chunk; |
82 | }; | 87 | }; |
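
Splitting the exception record into a packed, little-endian struct disk_exception and a CPU-order struct core_exception makes the on-disk format explicit and confines endian conversion to read_exception()/write_exception(). A user-space sketch of that split, assuming the glibc htole64()/le64toh() helpers:

#include <stdio.h>
#include <stdint.h>
#include <endian.h>	/* htole64/le64toh; glibc/musl specific */

struct disk_exception {		/* on-disk: little-endian, packed */
	uint64_t old_chunk;
	uint64_t new_chunk;
} __attribute__((packed));

struct core_exception {		/* in-core: CPU byte order */
	uint64_t old_chunk;
	uint64_t new_chunk;
};

static void write_exception(struct disk_exception *de,
			    const struct core_exception *e)
{
	de->old_chunk = htole64(e->old_chunk);
	de->new_chunk = htole64(e->new_chunk);
}

static void read_exception(const struct disk_exception *de,
			   struct core_exception *e)
{
	e->old_chunk = le64toh(de->old_chunk);
	e->new_chunk = le64toh(de->new_chunk);
}

int main(void)
{
	struct core_exception e = { .old_chunk = 7, .new_chunk = 123 };
	struct disk_exception de;
	struct core_exception back;

	write_exception(&de, &e);
	read_exception(&de, &back);
	printf("old %llu -> new %llu\n",
	       (unsigned long long)back.old_chunk,
	       (unsigned long long)back.new_chunk);
	return 0;
}
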
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps) | |||
169 | if (!ps->area) | 174 | if (!ps->area) |
170 | goto err_area; | 175 | goto err_area; |
171 | 176 | ||
172 | ps->zero_area = vmalloc(len); | 177 | ps->zero_area = vzalloc(len); |
173 | if (!ps->zero_area) | 178 | if (!ps->zero_area) |
174 | goto err_zero_area; | 179 | goto err_zero_area; |
175 | memset(ps->zero_area, 0, len); | ||
176 | 180 | ||
177 | ps->header_area = vmalloc(len); | 181 | ps->header_area = vmalloc(len); |
178 | if (!ps->header_area) | 182 | if (!ps->header_area) |
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | |||
396 | } | 400 | } |
397 | 401 | ||
398 | static void read_exception(struct pstore *ps, | 402 | static void read_exception(struct pstore *ps, |
399 | uint32_t index, struct disk_exception *result) | 403 | uint32_t index, struct core_exception *result) |
400 | { | 404 | { |
401 | struct disk_exception *e = get_exception(ps, index); | 405 | struct disk_exception *de = get_exception(ps, index); |
402 | 406 | ||
403 | /* copy it */ | 407 | /* copy it */ |
404 | result->old_chunk = le64_to_cpu(e->old_chunk); | 408 | result->old_chunk = le64_to_cpu(de->old_chunk); |
405 | result->new_chunk = le64_to_cpu(e->new_chunk); | 409 | result->new_chunk = le64_to_cpu(de->new_chunk); |
406 | } | 410 | } |
407 | 411 | ||
408 | static void write_exception(struct pstore *ps, | 412 | static void write_exception(struct pstore *ps, |
409 | uint32_t index, struct disk_exception *de) | 413 | uint32_t index, struct core_exception *e) |
410 | { | 414 | { |
411 | struct disk_exception *e = get_exception(ps, index); | 415 | struct disk_exception *de = get_exception(ps, index); |
412 | 416 | ||
413 | /* copy it */ | 417 | /* copy it */ |
414 | e->old_chunk = cpu_to_le64(de->old_chunk); | 418 | de->old_chunk = cpu_to_le64(e->old_chunk); |
415 | e->new_chunk = cpu_to_le64(de->new_chunk); | 419 | de->new_chunk = cpu_to_le64(e->new_chunk); |
416 | } | 420 | } |
417 | 421 | ||
418 | static void clear_exception(struct pstore *ps, uint32_t index) | 422 | static void clear_exception(struct pstore *ps, uint32_t index) |
419 | { | 423 | { |
420 | struct disk_exception *e = get_exception(ps, index); | 424 | struct disk_exception *de = get_exception(ps, index); |
421 | 425 | ||
422 | /* clear it */ | 426 | /* clear it */ |
423 | e->old_chunk = 0; | 427 | de->old_chunk = 0; |
424 | e->new_chunk = 0; | 428 | de->new_chunk = 0; |
425 | } | 429 | } |
426 | 430 | ||
427 | /* | 431 | /* |
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps, | |||
437 | { | 441 | { |
438 | int r; | 442 | int r; |
439 | unsigned int i; | 443 | unsigned int i; |
440 | struct disk_exception de; | 444 | struct core_exception e; |
441 | 445 | ||
442 | /* presume the area is full */ | 446 | /* presume the area is full */ |
443 | *full = 1; | 447 | *full = 1; |
444 | 448 | ||
445 | for (i = 0; i < ps->exceptions_per_area; i++) { | 449 | for (i = 0; i < ps->exceptions_per_area; i++) { |
446 | read_exception(ps, i, &de); | 450 | read_exception(ps, i, &e); |
447 | 451 | ||
448 | /* | 452 | /* |
449 | * If the new_chunk is pointing at the start of | 453 | * If the new_chunk is pointing at the start of |
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps, | |||
451 | * is we know that we've hit the end of the | 455 | * is we know that we've hit the end of the |
452 | * exceptions. Therefore the area is not full. | 456 | * exceptions. Therefore the area is not full. |
453 | */ | 457 | */ |
454 | if (de.new_chunk == 0LL) { | 458 | if (e.new_chunk == 0LL) { |
455 | ps->current_committed = i; | 459 | ps->current_committed = i; |
456 | *full = 0; | 460 | *full = 0; |
457 | break; | 461 | break; |
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps, | |||
460 | /* | 464 | /* |
461 | * Keep track of the start of the free chunks. | 465 | * Keep track of the start of the free chunks. |
462 | */ | 466 | */ |
463 | if (ps->next_free <= de.new_chunk) | 467 | if (ps->next_free <= e.new_chunk) |
464 | ps->next_free = de.new_chunk + 1; | 468 | ps->next_free = e.new_chunk + 1; |
465 | 469 | ||
466 | /* | 470 | /* |
467 | * Otherwise we add the exception to the snapshot. | 471 | * Otherwise we add the exception to the snapshot. |
468 | */ | 472 | */ |
469 | r = callback(callback_context, de.old_chunk, de.new_chunk); | 473 | r = callback(callback_context, e.old_chunk, e.new_chunk); |
470 | if (r) | 474 | if (r) |
471 | return r; | 475 | return r; |
472 | } | 476 | } |
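
insert_exceptions() treats a zero new_chunk as the end-of-area marker (chunk 0 always holds the header, so it can never be a copy destination) and tracks the highest allocated chunk to seed next_free. A compact sketch of that scan over an in-memory area:

#include <stdio.h>
#include <stdint.h>

struct core_exception { uint64_t old_chunk, new_chunk; };

/* Scan one metadata area; returns the number of valid exceptions and
 * advances *next_free past the highest allocated chunk seen.
 */
static unsigned scan_area(const struct core_exception *area, unsigned per_area,
			  uint64_t *next_free)
{
	unsigned i;

	for (i = 0; i < per_area; i++) {
		if (!area[i].new_chunk)		/* end marker: area not full */
			break;
		if (*next_free <= area[i].new_chunk)
			*next_free = area[i].new_chunk + 1;
	}
	return i;
}

int main(void)
{
	struct core_exception area[4] = {
		{ 10, 2 }, { 11, 3 }, { 57, 4 }, { 0, 0 },
	};
	uint64_t next_free = 1;
	unsigned n = scan_area(area, 4, &next_free);

	printf("%u exceptions, next free chunk %llu\n",
	       n, (unsigned long long)next_free);
	return 0;
}
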
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, | |||
563 | ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / | 567 | ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / |
564 | sizeof(struct disk_exception); | 568 | sizeof(struct disk_exception); |
565 | ps->callbacks = dm_vcalloc(ps->exceptions_per_area, | 569 | ps->callbacks = dm_vcalloc(ps->exceptions_per_area, |
566 | sizeof(*ps->callbacks)); | 570 | sizeof(*ps->callbacks)); |
567 | if (!ps->callbacks) | 571 | if (!ps->callbacks) |
568 | return -ENOMEM; | 572 | return -ENOMEM; |
569 | 573 | ||
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
641 | { | 645 | { |
642 | unsigned int i; | 646 | unsigned int i; |
643 | struct pstore *ps = get_info(store); | 647 | struct pstore *ps = get_info(store); |
644 | struct disk_exception de; | 648 | struct core_exception ce; |
645 | struct commit_callback *cb; | 649 | struct commit_callback *cb; |
646 | 650 | ||
647 | de.old_chunk = e->old_chunk; | 651 | ce.old_chunk = e->old_chunk; |
648 | de.new_chunk = e->new_chunk; | 652 | ce.new_chunk = e->new_chunk; |
649 | write_exception(ps, ps->current_committed++, &de); | 653 | write_exception(ps, ps->current_committed++, &ce); |
650 | 654 | ||
651 | /* | 655 | /* |
652 | * Add the callback to the back of the array. This code | 656 | * Add the callback to the back of the array. This code |
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
670 | * If we completely filled the current area, then wipe the next one. | 674 | * If we completely filled the current area, then wipe the next one. |
671 | */ | 675 | */ |
672 | if ((ps->current_committed == ps->exceptions_per_area) && | 676 | if ((ps->current_committed == ps->exceptions_per_area) && |
673 | zero_disk_area(ps, ps->current_area + 1)) | 677 | zero_disk_area(ps, ps->current_area + 1)) |
674 | ps->valid = 0; | 678 | ps->valid = 0; |
675 | 679 | ||
676 | /* | 680 | /* |
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, | |||
701 | chunk_t *last_new_chunk) | 705 | chunk_t *last_new_chunk) |
702 | { | 706 | { |
703 | struct pstore *ps = get_info(store); | 707 | struct pstore *ps = get_info(store); |
704 | struct disk_exception de; | 708 | struct core_exception ce; |
705 | int nr_consecutive; | 709 | int nr_consecutive; |
706 | int r; | 710 | int r; |
707 | 711 | ||
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, | |||
722 | ps->current_committed = ps->exceptions_per_area; | 726 | ps->current_committed = ps->exceptions_per_area; |
723 | } | 727 | } |
724 | 728 | ||
725 | read_exception(ps, ps->current_committed - 1, &de); | 729 | read_exception(ps, ps->current_committed - 1, &ce); |
726 | *last_old_chunk = de.old_chunk; | 730 | *last_old_chunk = ce.old_chunk; |
727 | *last_new_chunk = de.new_chunk; | 731 | *last_new_chunk = ce.new_chunk; |
728 | 732 | ||
729 | /* | 733 | /* |
730 | * Find number of consecutive chunks within the current area, | 734 | * Find number of consecutive chunks within the current area, |
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store, | |||
733 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; | 737 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; |
734 | nr_consecutive++) { | 738 | nr_consecutive++) { |
735 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, | 739 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, |
736 | &de); | 740 | &ce); |
737 | if (de.old_chunk != *last_old_chunk - nr_consecutive || | 741 | if (ce.old_chunk != *last_old_chunk - nr_consecutive || |
738 | de.new_chunk != *last_new_chunk - nr_consecutive) | 742 | ce.new_chunk != *last_new_chunk - nr_consecutive) |
739 | break; | 743 | break; |
740 | } | 744 | } |
741 | 745 | ||
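
persistent_prepare_merge() walks backwards from the most recently committed exception, counting how many preceding entries form a run in which both old_chunk and new_chunk step down by exactly one, so a whole run can be merged in a single pass. A sketch of that backward scan over hypothetical in-memory data (the kernel version also stops at the area boundary):

#include <stdio.h>
#include <stdint.h>

struct core_exception { uint64_t old_chunk, new_chunk; };

/* Count how many committed exceptions, ending at index 'last', form a run
 * where both chunk numbers decrease by one per entry.
 */
static int count_consecutive(const struct core_exception *e, int last)
{
	int n = 1;

	while (n <= last &&
	       e[last - n].old_chunk == e[last].old_chunk - n &&
	       e[last - n].new_chunk == e[last].new_chunk - n)
		n++;
	return n;
}

int main(void)
{
	struct core_exception committed[] = {
		{ 90, 2 }, { 20, 3 }, { 21, 4 }, { 22, 5 },
	};

	printf("%d consecutive exceptions can be merged\n",
	       count_consecutive(committed, 3));	/* expect 3 */
	return 0;
}
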
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, | |||
753 | for (i = 0; i < nr_merged; i++) | 757 | for (i = 0; i < nr_merged; i++) |
754 | clear_exception(ps, ps->current_committed - 1 - i); | 758 | clear_exception(ps, ps->current_committed - 1 - i); |
755 | 759 | ||
756 | r = area_io(ps, WRITE); | 760 | r = area_io(ps, WRITE_FLUSH_FUA); |
757 | if (r < 0) | 761 | if (r < 0) |
758 | return r; | 762 | return r; |
759 | 763 | ||
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9ecff5f3023..6f758870fc1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | |||
30 | ((ti)->type->name == dm_snapshot_merge_target_name) | 30 | ((ti)->type->name == dm_snapshot_merge_target_name) |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * The percentage increment we will wake up users at | ||
34 | */ | ||
35 | #define WAKE_UP_PERCENT 5 | ||
36 | |||
37 | /* | ||
38 | * kcopyd priority of snapshot operations | ||
39 | */ | ||
40 | #define SNAPSHOT_COPY_PRIORITY 2 | ||
41 | |||
42 | /* | ||
43 | * The size of the mempool used to track chunks in use. | 33 | * The size of the mempool used to track chunks in use. |
44 | */ | 34 | */ |
45 | #define MIN_IOS 256 | 35 | #define MIN_IOS 256 |
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception { | |||
180 | * kcopyd. | 170 | * kcopyd. |
181 | */ | 171 | */ |
182 | int started; | 172 | int started; |
173 | |||
174 | /* | ||
175 | * For writing a complete chunk, bypassing the copy. | ||
176 | */ | ||
177 | struct bio *full_bio; | ||
178 | bio_end_io_t *full_bio_end_io; | ||
179 | void *full_bio_private; | ||
183 | }; | 180 | }; |
184 | 181 | ||
185 | /* | 182 | /* |
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1055 | 1052 | ||
1056 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1053 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1057 | if (!s) { | 1054 | if (!s) { |
1058 | ti->error = "Cannot allocate snapshot context private " | 1055 | ti->error = "Cannot allocate private snapshot structure"; |
1059 | "structure"; | ||
1060 | r = -ENOMEM; | 1056 | r = -ENOMEM; |
1061 | goto bad; | 1057 | goto bad; |
1062 | } | 1058 | } |
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
1380 | struct dm_snapshot *s = pe->snap; | 1376 | struct dm_snapshot *s = pe->snap; |
1381 | struct bio *origin_bios = NULL; | 1377 | struct bio *origin_bios = NULL; |
1382 | struct bio *snapshot_bios = NULL; | 1378 | struct bio *snapshot_bios = NULL; |
1379 | struct bio *full_bio = NULL; | ||
1383 | int error = 0; | 1380 | int error = 0; |
1384 | 1381 | ||
1385 | if (!success) { | 1382 | if (!success) { |
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
1415 | */ | 1412 | */ |
1416 | dm_insert_exception(&s->complete, e); | 1413 | dm_insert_exception(&s->complete, e); |
1417 | 1414 | ||
1418 | out: | 1415 | out: |
1419 | dm_remove_exception(&pe->e); | 1416 | dm_remove_exception(&pe->e); |
1420 | snapshot_bios = bio_list_get(&pe->snapshot_bios); | 1417 | snapshot_bios = bio_list_get(&pe->snapshot_bios); |
1421 | origin_bios = bio_list_get(&pe->origin_bios); | 1418 | origin_bios = bio_list_get(&pe->origin_bios); |
1419 | full_bio = pe->full_bio; | ||
1420 | if (full_bio) { | ||
1421 | full_bio->bi_end_io = pe->full_bio_end_io; | ||
1422 | full_bio->bi_private = pe->full_bio_private; | ||
1423 | } | ||
1422 | free_pending_exception(pe); | 1424 | free_pending_exception(pe); |
1423 | 1425 | ||
1424 | increment_pending_exceptions_done_count(); | 1426 | increment_pending_exceptions_done_count(); |
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
1426 | up_write(&s->lock); | 1428 | up_write(&s->lock); |
1427 | 1429 | ||
1428 | /* Submit any pending write bios */ | 1430 | /* Submit any pending write bios */ |
1429 | if (error) | 1431 | if (error) { |
1432 | if (full_bio) | ||
1433 | bio_io_error(full_bio); | ||
1430 | error_bios(snapshot_bios); | 1434 | error_bios(snapshot_bios); |
1431 | else | 1435 | } else { |
1436 | if (full_bio) | ||
1437 | bio_endio(full_bio, 0); | ||
1432 | flush_bios(snapshot_bios); | 1438 | flush_bios(snapshot_bios); |
1439 | } | ||
1433 | 1440 | ||
1434 | retry_origin_bios(s, origin_bios); | 1441 | retry_origin_bios(s, origin_bios); |
1435 | } | 1442 | } |
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
1480 | dest.count = src.count; | 1487 | dest.count = src.count; |
1481 | 1488 | ||
1482 | /* Hand over to kcopyd */ | 1489 | /* Hand over to kcopyd */ |
1483 | dm_kcopyd_copy(s->kcopyd_client, | 1490 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); |
1484 | &src, 1, &dest, 0, copy_callback, pe); | 1491 | } |
1492 | |||
1493 | static void full_bio_end_io(struct bio *bio, int error) | ||
1494 | { | ||
1495 | void *callback_data = bio->bi_private; | ||
1496 | |||
1497 | dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0); | ||
1498 | } | ||
1499 | |||
1500 | static void start_full_bio(struct dm_snap_pending_exception *pe, | ||
1501 | struct bio *bio) | ||
1502 | { | ||
1503 | struct dm_snapshot *s = pe->snap; | ||
1504 | void *callback_data; | ||
1505 | |||
1506 | pe->full_bio = bio; | ||
1507 | pe->full_bio_end_io = bio->bi_end_io; | ||
1508 | pe->full_bio_private = bio->bi_private; | ||
1509 | |||
1510 | callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, | ||
1511 | copy_callback, pe); | ||
1512 | |||
1513 | bio->bi_end_io = full_bio_end_io; | ||
1514 | bio->bi_private = callback_data; | ||
1515 | |||
1516 | generic_make_request(bio); | ||
1485 | } | 1517 | } |
1486 | 1518 | ||
1487 | static struct dm_snap_pending_exception * | 1519 | static struct dm_snap_pending_exception * |
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1519 | bio_list_init(&pe->origin_bios); | 1551 | bio_list_init(&pe->origin_bios); |
1520 | bio_list_init(&pe->snapshot_bios); | 1552 | bio_list_init(&pe->snapshot_bios); |
1521 | pe->started = 0; | 1553 | pe->started = 0; |
1554 | pe->full_bio = NULL; | ||
1522 | 1555 | ||
1523 | if (s->store->type->prepare_exception(s->store, &pe->e)) { | 1556 | if (s->store->type->prepare_exception(s->store, &pe->e)) { |
1524 | free_pending_exception(pe); | 1557 | free_pending_exception(pe); |
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1612 | } | 1645 | } |
1613 | 1646 | ||
1614 | remap_exception(s, &pe->e, bio, chunk); | 1647 | remap_exception(s, &pe->e, bio, chunk); |
1615 | bio_list_add(&pe->snapshot_bios, bio); | ||
1616 | 1648 | ||
1617 | r = DM_MAPIO_SUBMITTED; | 1649 | r = DM_MAPIO_SUBMITTED; |
1618 | 1650 | ||
1651 | if (!pe->started && | ||
1652 | bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { | ||
1653 | pe->started = 1; | ||
1654 | up_write(&s->lock); | ||
1655 | start_full_bio(pe, bio); | ||
1656 | goto out; | ||
1657 | } | ||
1658 | |||
1659 | bio_list_add(&pe->snapshot_bios, bio); | ||
1660 | |||
1619 | if (!pe->started) { | 1661 | if (!pe->started) { |
1620 | /* this is protected by snap->lock */ | 1662 | /* this is protected by snap->lock */ |
1621 | pe->started = 1; | 1663 | pe->started = 1; |
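Illustration of when the new path triggers (assumed numbers, not from the patch): with an 8-sector chunk size, s->store->chunk_size << SECTOR_SHIFT is 4096, so a single 4 KiB bio that covers its whole chunk is handed straight to start_full_bio() and never goes through the kcopyd copy, while any smaller write is still queued on pe->snapshot_bios and waits for the usual copy-on-write to complete.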
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1628 | map_context->ptr = track_chunk(s, chunk); | 1670 | map_context->ptr = track_chunk(s, chunk); |
1629 | } | 1671 | } |
1630 | 1672 | ||
1631 | out_unlock: | 1673 | out_unlock: |
1632 | up_write(&s->lock); | 1674 | up_write(&s->lock); |
1633 | out: | 1675 | out: |
1634 | return r; | 1676 | return r; |
1635 | } | 1677 | } |
1636 | 1678 | ||
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, | |||
1974 | pe_to_start_now = pe; | 2016 | pe_to_start_now = pe; |
1975 | } | 2017 | } |
1976 | 2018 | ||
1977 | next_snapshot: | 2019 | next_snapshot: |
1978 | up_write(&snap->lock); | 2020 | up_write(&snap->lock); |
1979 | 2021 | ||
1980 | if (pe_to_start_now) { | 2022 | if (pe_to_start_now) { |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 451c3bb176d..bc04518e9d8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -17,7 +17,7 @@ | |||
17 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
19 | #include <linux/delay.h> | 19 | #include <linux/delay.h> |
20 | #include <asm/atomic.h> | 20 | #include <linux/atomic.h> |
21 | 21 | ||
22 | #define DM_MSG_PREFIX "table" | 22 | #define DM_MSG_PREFIX "table" |
23 | 23 | ||
@@ -54,7 +54,6 @@ struct dm_table { | |||
54 | sector_t *highs; | 54 | sector_t *highs; |
55 | struct dm_target *targets; | 55 | struct dm_target *targets; |
56 | 56 | ||
57 | unsigned discards_supported:1; | ||
58 | unsigned integrity_supported:1; | 57 | unsigned integrity_supported:1; |
59 | 58 | ||
60 | /* | 59 | /* |
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) | |||
154 | return NULL; | 153 | return NULL; |
155 | 154 | ||
156 | size = nmemb * elem_size; | 155 | size = nmemb * elem_size; |
157 | addr = vmalloc(size); | 156 | addr = vzalloc(size); |
158 | if (addr) | ||
159 | memset(addr, 0, size); | ||
160 | 157 | ||
161 | return addr; | 158 | return addr; |
162 | } | 159 | } |
160 | EXPORT_SYMBOL(dm_vcalloc); | ||
163 | 161 | ||
164 | /* | 162 | /* |
165 | * highs, and targets are managed as dynamic arrays during a | 163 | * highs, and targets are managed as dynamic arrays during a |
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, | |||
209 | INIT_LIST_HEAD(&t->devices); | 207 | INIT_LIST_HEAD(&t->devices); |
210 | INIT_LIST_HEAD(&t->target_callbacks); | 208 | INIT_LIST_HEAD(&t->target_callbacks); |
211 | atomic_set(&t->holders, 0); | 209 | atomic_set(&t->holders, 0); |
212 | t->discards_supported = 1; | ||
213 | 210 | ||
214 | if (!num_targets) | 211 | if (!num_targets) |
215 | num_targets = KEYS_PER_NODE; | 212 | num_targets = KEYS_PER_NODE; |
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t) | |||
281 | { | 278 | { |
282 | atomic_inc(&t->holders); | 279 | atomic_inc(&t->holders); |
283 | } | 280 | } |
281 | EXPORT_SYMBOL(dm_table_get); | ||
284 | 282 | ||
285 | void dm_table_put(struct dm_table *t) | 283 | void dm_table_put(struct dm_table *t) |
286 | { | 284 | { |
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t) | |||
290 | smp_mb__before_atomic_dec(); | 288 | smp_mb__before_atomic_dec(); |
291 | atomic_dec(&t->holders); | 289 | atomic_dec(&t->holders); |
292 | } | 290 | } |
291 | EXPORT_SYMBOL(dm_table_put); | ||
293 | 292 | ||
294 | /* | 293 | /* |
295 | * Checks to see if we need to extend highs or targets. | 294 | * Checks to see if we need to extend highs or targets. |
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, | |||
455 | * Add a device to the list, or just increment the usage count if | 454 | * Add a device to the list, or just increment the usage count if |
456 | * it's already present. | 455 | * it's already present. |
457 | */ | 456 | */ |
458 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, | 457 | int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, |
459 | const char *path, fmode_t mode, struct dm_dev **result) | 458 | struct dm_dev **result) |
460 | { | 459 | { |
461 | int r; | 460 | int r; |
462 | dev_t uninitialized_var(dev); | 461 | dev_t uninitialized_var(dev); |
463 | struct dm_dev_internal *dd; | 462 | struct dm_dev_internal *dd; |
464 | unsigned int major, minor; | 463 | unsigned int major, minor; |
464 | struct dm_table *t = ti->table; | ||
465 | 465 | ||
466 | BUG_ON(!t); | 466 | BUG_ON(!t); |
467 | 467 | ||
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
509 | *result = &dd->dm_dev; | 509 | *result = &dd->dm_dev; |
510 | return 0; | 510 | return 0; |
511 | } | 511 | } |
512 | EXPORT_SYMBOL(dm_get_device); | ||
512 | 513 | ||
513 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | 514 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, |
514 | sector_t start, sector_t len, void *data) | 515 | sector_t start, sector_t len, void *data) |
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
539 | * If not we'll force DM to use PAGE_SIZE or | 540 | * If not we'll force DM to use PAGE_SIZE or |
540 | * smaller I/O, just to be safe. | 541 | * smaller I/O, just to be safe. |
541 | */ | 542 | */ |
542 | 543 | if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) | |
543 | if (q->merge_bvec_fn && !ti->type->merge) | ||
544 | blk_limits_max_hw_sectors(limits, | 544 | blk_limits_max_hw_sectors(limits, |
545 | (unsigned int) (PAGE_SIZE >> 9)); | 545 | (unsigned int) (PAGE_SIZE >> 9)); |
546 | return 0; | 546 | return 0; |
547 | } | 547 | } |
548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
549 | 549 | ||
550 | int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, | ||
551 | struct dm_dev **result) | ||
552 | { | ||
553 | return __table_get_device(ti->table, ti, path, mode, result); | ||
554 | } | ||
555 | |||
556 | |||
557 | /* | 550 | /* |
558 | * Decrement a devices use count and remove it if necessary. | 551 | * Decrement a device's use count and remove it if necessary. |
559 | */ | 552 | */ |
560 | void dm_put_device(struct dm_target *ti, struct dm_dev *d) | 553 | void dm_put_device(struct dm_target *ti, struct dm_dev *d) |
561 | { | 554 | { |
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d) | |||
568 | kfree(dd); | 561 | kfree(dd); |
569 | } | 562 | } |
570 | } | 563 | } |
564 | EXPORT_SYMBOL(dm_put_device); | ||
571 | 565 | ||
572 | /* | 566 | /* |
573 | * Checks to see if the target joins onto the end of the table. | 567 | * Checks to see if the target joins onto the end of the table. |
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
791 | 785 | ||
792 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 786 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
793 | 787 | ||
794 | if (!tgt->num_discard_requests) | 788 | if (!tgt->num_discard_requests && tgt->discards_supported) |
795 | t->discards_supported = 0; | 789 | DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", |
790 | dm_device_name(t->md), type); | ||
796 | 791 | ||
797 | return 0; | 792 | return 0; |
798 | 793 | ||
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
802 | return r; | 797 | return r; |
803 | } | 798 | } |
804 | 799 | ||
800 | /* | ||
801 | * Target argument parsing helpers. | ||
802 | */ | ||
803 | static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, | ||
804 | unsigned *value, char **error, unsigned grouped) | ||
805 | { | ||
806 | const char *arg_str = dm_shift_arg(arg_set); | ||
807 | |||
808 | if (!arg_str || | ||
809 | (sscanf(arg_str, "%u", value) != 1) || | ||
810 | (*value < arg->min) || | ||
811 | (*value > arg->max) || | ||
812 | (grouped && arg_set->argc < *value)) { | ||
813 | *error = arg->error; | ||
814 | return -EINVAL; | ||
815 | } | ||
816 | |||
817 | return 0; | ||
818 | } | ||
819 | |||
820 | int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, | ||
821 | unsigned *value, char **error) | ||
822 | { | ||
823 | return validate_next_arg(arg, arg_set, value, error, 0); | ||
824 | } | ||
825 | EXPORT_SYMBOL(dm_read_arg); | ||
826 | |||
827 | int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, | ||
828 | unsigned *value, char **error) | ||
829 | { | ||
830 | return validate_next_arg(arg, arg_set, value, error, 1); | ||
831 | } | ||
832 | EXPORT_SYMBOL(dm_read_arg_group); | ||
833 | |||
834 | const char *dm_shift_arg(struct dm_arg_set *as) | ||
835 | { | ||
836 | char *r; | ||
837 | |||
838 | if (as->argc) { | ||
839 | as->argc--; | ||
840 | r = *as->argv; | ||
841 | as->argv++; | ||
842 | return r; | ||
843 | } | ||
844 | |||
845 | return NULL; | ||
846 | } | ||
847 | EXPORT_SYMBOL(dm_shift_arg); | ||
848 | |||
849 | void dm_consume_args(struct dm_arg_set *as, unsigned num_args) | ||
850 | { | ||
851 | BUG_ON(as->argc < num_args); | ||
852 | as->argc -= num_args; | ||
853 | as->argv += num_args; | ||
854 | } | ||
855 | EXPORT_SYMBOL(dm_consume_args); | ||
856 | |||
805 | static int dm_table_set_type(struct dm_table *t) | 857 | static int dm_table_set_type(struct dm_table *t) |
806 | { | 858 | { |
807 | unsigned i; | 859 | unsigned i; |
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t) | |||
1077 | t->event_fn(t->event_context); | 1129 | t->event_fn(t->event_context); |
1078 | mutex_unlock(&_event_lock); | 1130 | mutex_unlock(&_event_lock); |
1079 | } | 1131 | } |
1132 | EXPORT_SYMBOL(dm_table_event); | ||
1080 | 1133 | ||
1081 | sector_t dm_table_get_size(struct dm_table *t) | 1134 | sector_t dm_table_get_size(struct dm_table *t) |
1082 | { | 1135 | { |
1083 | return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; | 1136 | return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; |
1084 | } | 1137 | } |
1138 | EXPORT_SYMBOL(dm_table_get_size); | ||
1085 | 1139 | ||
1086 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) | 1140 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) |
1087 | { | 1141 | { |
@@ -1184,19 +1238,72 @@ static void dm_table_set_integrity(struct dm_table *t) | |||
1184 | return; | 1238 | return; |
1185 | 1239 | ||
1186 | template_disk = dm_table_get_integrity_disk(t, true); | 1240 | template_disk = dm_table_get_integrity_disk(t, true); |
1187 | if (!template_disk && | 1241 | if (template_disk) |
1188 | blk_integrity_is_initialized(dm_disk(t->md))) { | 1242 | blk_integrity_register(dm_disk(t->md), |
1243 | blk_get_integrity(template_disk)); | ||
1244 | else if (blk_integrity_is_initialized(dm_disk(t->md))) | ||
1189 | DMWARN("%s: device no longer has a valid integrity profile", | 1245 | DMWARN("%s: device no longer has a valid integrity profile", |
1190 | dm_device_name(t->md)); | 1246 | dm_device_name(t->md)); |
1191 | return; | 1247 | else |
1248 | DMWARN("%s: unable to establish an integrity profile", | ||
1249 | dm_device_name(t->md)); | ||
1250 | } | ||
1251 | |||
1252 | static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, | ||
1253 | sector_t start, sector_t len, void *data) | ||
1254 | { | ||
1255 | unsigned flush = (*(unsigned *)data); | ||
1256 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
1257 | |||
1258 | return q && (q->flush_flags & flush); | ||
1259 | } | ||
1260 | |||
1261 | static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) | ||
1262 | { | ||
1263 | struct dm_target *ti; | ||
1264 | unsigned i = 0; | ||
1265 | |||
1266 | /* | ||
1267 | * Require at least one underlying device to support flushes. | ||
1268 | * t->devices includes internal dm devices such as mirror logs | ||
1269 | * so we need to use iterate_devices here, which targets | ||
1270 | * supporting flushes must provide. | ||
1271 | */ | ||
1272 | while (i < dm_table_get_num_targets(t)) { | ||
1273 | ti = dm_table_get_target(t, i++); | ||
1274 | |||
1275 | if (!ti->num_flush_requests) | ||
1276 | continue; | ||
1277 | |||
1278 | if (ti->type->iterate_devices && | ||
1279 | ti->type->iterate_devices(ti, device_flush_capable, &flush)) | ||
1280 | return 1; | ||
1281 | } | ||
1282 | |||
1283 | return 0; | ||
1284 | } | ||
1285 | |||
1286 | static bool dm_table_discard_zeroes_data(struct dm_table *t) | ||
1287 | { | ||
1288 | struct dm_target *ti; | ||
1289 | unsigned i = 0; | ||
1290 | |||
1291 | /* Ensure that all targets support discard_zeroes_data. */ | ||
1292 | while (i < dm_table_get_num_targets(t)) { | ||
1293 | ti = dm_table_get_target(t, i++); | ||
1294 | |||
1295 | if (ti->discard_zeroes_data_unsupported) | ||
1296 | return 0; | ||
1192 | } | 1297 | } |
1193 | blk_integrity_register(dm_disk(t->md), | 1298 | |
1194 | blk_get_integrity(template_disk)); | 1299 | return 1; |
1195 | } | 1300 | } |
1196 | 1301 | ||
1197 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | 1302 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
1198 | struct queue_limits *limits) | 1303 | struct queue_limits *limits) |
1199 | { | 1304 | { |
1305 | unsigned flush = 0; | ||
1306 | |||
1200 | /* | 1307 | /* |
1201 | * Copy table's limits to the DM device's request_queue | 1308 | * Copy table's limits to the DM device's request_queue |
1202 | */ | 1309 | */ |
@@ -1207,6 +1314,16 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1207 | else | 1314 | else |
1208 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); | 1315 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); |
1209 | 1316 | ||
1317 | if (dm_table_supports_flush(t, REQ_FLUSH)) { | ||
1318 | flush |= REQ_FLUSH; | ||
1319 | if (dm_table_supports_flush(t, REQ_FUA)) | ||
1320 | flush |= REQ_FUA; | ||
1321 | } | ||
1322 | blk_queue_flush(q, flush); | ||
1323 | |||
1324 | if (!dm_table_discard_zeroes_data(t)) | ||
1325 | q->limits.discard_zeroes_data = 0; | ||
1326 | |||
1210 | dm_table_set_integrity(t); | 1327 | dm_table_set_integrity(t); |
1211 | 1328 | ||
1212 | /* | 1329 | /* |
@@ -1237,6 +1354,7 @@ fmode_t dm_table_get_mode(struct dm_table *t) | |||
1237 | { | 1354 | { |
1238 | return t->mode; | 1355 | return t->mode; |
1239 | } | 1356 | } |
1357 | EXPORT_SYMBOL(dm_table_get_mode); | ||
1240 | 1358 | ||
1241 | static void suspend_targets(struct dm_table *t, unsigned postsuspend) | 1359 | static void suspend_targets(struct dm_table *t, unsigned postsuspend) |
1242 | { | 1360 | { |
@@ -1345,6 +1463,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) | |||
1345 | { | 1463 | { |
1346 | return t->md; | 1464 | return t->md; |
1347 | } | 1465 | } |
1466 | EXPORT_SYMBOL(dm_table_get_md); | ||
1348 | 1467 | ||
1349 | static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, | 1468 | static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, |
1350 | sector_t start, sector_t len, void *data) | 1469 | sector_t start, sector_t len, void *data) |
@@ -1359,19 +1478,19 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1359 | struct dm_target *ti; | 1478 | struct dm_target *ti; |
1360 | unsigned i = 0; | 1479 | unsigned i = 0; |
1361 | 1480 | ||
1362 | if (!t->discards_supported) | ||
1363 | return 0; | ||
1364 | |||
1365 | /* | 1481 | /* |
1366 | * Unless any target used by the table set discards_supported, | 1482 | * Unless any target used by the table set discards_supported, |
1367 | * require at least one underlying device to support discards. | 1483 | * require at least one underlying device to support discards. |
1368 | * t->devices includes internal dm devices such as mirror logs | 1484 | * t->devices includes internal dm devices such as mirror logs |
1369 | * so we need to use iterate_devices here, which targets | 1485 | * so we need to use iterate_devices here, which targets |
1370 | * supporting discard must provide. | 1486 | * supporting discard selectively must provide. |
1371 | */ | 1487 | */ |
1372 | while (i < dm_table_get_num_targets(t)) { | 1488 | while (i < dm_table_get_num_targets(t)) { |
1373 | ti = dm_table_get_target(t, i++); | 1489 | ti = dm_table_get_target(t, i++); |
1374 | 1490 | ||
1491 | if (!ti->num_discard_requests) | ||
1492 | continue; | ||
1493 | |||
1375 | if (ti->discards_supported) | 1494 | if (ti->discards_supported) |
1376 | return 1; | 1495 | return 1; |
1377 | 1496 | ||
@@ -1382,13 +1501,3 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1382 | 1501 | ||
1383 | return 0; | 1502 | return 0; |
1384 | } | 1503 | } |
1385 | |||
1386 | EXPORT_SYMBOL(dm_vcalloc); | ||
1387 | EXPORT_SYMBOL(dm_get_device); | ||
1388 | EXPORT_SYMBOL(dm_put_device); | ||
1389 | EXPORT_SYMBOL(dm_table_event); | ||
1390 | EXPORT_SYMBOL(dm_table_get_size); | ||
1391 | EXPORT_SYMBOL(dm_table_get_mode); | ||
1392 | EXPORT_SYMBOL(dm_table_get_md); | ||
1393 | EXPORT_SYMBOL(dm_table_put); | ||
1394 | EXPORT_SYMBOL(dm_table_get); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0cf68b47887..52b39f335bb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME; | |||
37 | static unsigned int major = 0; | 37 | static unsigned int major = 0; |
38 | static unsigned int _major = 0; | 38 | static unsigned int _major = 0; |
39 | 39 | ||
40 | static DEFINE_IDR(_minor_idr); | ||
41 | |||
40 | static DEFINE_SPINLOCK(_minor_lock); | 42 | static DEFINE_SPINLOCK(_minor_lock); |
41 | /* | 43 | /* |
42 | * For bio-based dm. | 44 | * For bio-based dm. |
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | |||
109 | #define DMF_FREEING 3 | 111 | #define DMF_FREEING 3 |
110 | #define DMF_DELETING 4 | 112 | #define DMF_DELETING 4 |
111 | #define DMF_NOFLUSH_SUSPENDING 5 | 113 | #define DMF_NOFLUSH_SUSPENDING 5 |
114 | #define DMF_MERGE_IS_OPTIONAL 6 | ||
112 | 115 | ||
113 | /* | 116 | /* |
114 | * Work processed by per-device workqueue. | 117 | * Work processed by per-device workqueue. |
@@ -313,6 +316,12 @@ static void __exit dm_exit(void) | |||
313 | 316 | ||
314 | while (i--) | 317 | while (i--) |
315 | _exits[i](); | 318 | _exits[i](); |
319 | |||
320 | /* | ||
321 | * Should be empty by this point. | ||
322 | */ | ||
323 | idr_remove_all(&_minor_idr); | ||
324 | idr_destroy(&_minor_idr); | ||
316 | } | 325 | } |
317 | 326 | ||
318 | /* | 327 | /* |
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci) | |||
1171 | 1180 | ||
1172 | /* | 1181 | /* |
1173 | * Even though the device advertised discard support, | 1182 | * Even though the device advertised discard support, |
1174 | * reconfiguration might have changed that since the | 1183 | * that does not mean every target supports it, and |
1184 | * reconfiguration might also have changed that since the | ||
1175 | * check was performed. | 1185 | * check was performed. |
1176 | */ | 1186 | */ |
1177 | if (!ti->num_discard_requests) | 1187 | if (!ti->num_discard_requests) |
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1705 | /*----------------------------------------------------------------- | 1715 | /*----------------------------------------------------------------- |
1706 | * An IDR is used to keep track of allocated minor numbers. | 1716 | * An IDR is used to keep track of allocated minor numbers. |
1707 | *---------------------------------------------------------------*/ | 1717 | *---------------------------------------------------------------*/ |
1708 | static DEFINE_IDR(_minor_idr); | ||
1709 | |||
1710 | static void free_minor(int minor) | 1718 | static void free_minor(int minor) |
1711 | { | 1719 | { |
1712 | spin_lock(&_minor_lock); | 1720 | spin_lock(&_minor_lock); |
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md) | |||
1800 | blk_queue_make_request(md->queue, dm_request); | 1808 | blk_queue_make_request(md->queue, dm_request); |
1801 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1809 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1802 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1810 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
1803 | blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); | ||
1804 | } | 1811 | } |
1805 | 1812 | ||
1806 | /* | 1813 | /* |
@@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1986 | } | 1993 | } |
1987 | 1994 | ||
1988 | /* | 1995 | /* |
1996 | * Return 1 if the queue has a compulsory merge_bvec_fn function. | ||
1997 | * | ||
1998 | * If this function returns 0, then the device is either a non-dm | ||
1999 | * device without a merge_bvec_fn, or it is a dm device that is | ||
2000 | * able to split any bios it receives that are too big. | ||
2001 | */ | ||
2002 | int dm_queue_merge_is_compulsory(struct request_queue *q) | ||
2003 | { | ||
2004 | struct mapped_device *dev_md; | ||
2005 | |||
2006 | if (!q->merge_bvec_fn) | ||
2007 | return 0; | ||
2008 | |||
2009 | if (q->make_request_fn == dm_request) { | ||
2010 | dev_md = q->queuedata; | ||
2011 | if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) | ||
2012 | return 0; | ||
2013 | } | ||
2014 | |||
2015 | return 1; | ||
2016 | } | ||
2017 | |||
2018 | static int dm_device_merge_is_compulsory(struct dm_target *ti, | ||
2019 | struct dm_dev *dev, sector_t start, | ||
2020 | sector_t len, void *data) | ||
2021 | { | ||
2022 | struct block_device *bdev = dev->bdev; | ||
2023 | struct request_queue *q = bdev_get_queue(bdev); | ||
2024 | |||
2025 | return dm_queue_merge_is_compulsory(q); | ||
2026 | } | ||
2027 | |||
2028 | /* | ||
2029 | * Return 1 if it is acceptable to ignore merge_bvec_fn based | ||
2030 | * on the properties of the underlying devices. | ||
2031 | */ | ||
2032 | static int dm_table_merge_is_optional(struct dm_table *table) | ||
2033 | { | ||
2034 | unsigned i = 0; | ||
2035 | struct dm_target *ti; | ||
2036 | |||
2037 | while (i < dm_table_get_num_targets(table)) { | ||
2038 | ti = dm_table_get_target(table, i++); | ||
2039 | |||
2040 | if (ti->type->iterate_devices && | ||
2041 | ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) | ||
2042 | return 0; | ||
2043 | } | ||
2044 | |||
2045 | return 1; | ||
2046 | } | ||
2047 | |||
2048 | /* | ||
1989 | * Returns old map, which caller must destroy. | 2049 | * Returns old map, which caller must destroy. |
1990 | */ | 2050 | */ |
1991 | static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | 2051 | static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, |
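The two halves fit together as follows (a reading of the hunks above, not text from the patch): dm_table_merge_is_optional() reports whether every device under the table lacks a compulsory merge_bvec_fn, __bind() records the answer in DMF_MERGE_IS_OPTIONAL, and dm_queue_merge_is_compulsory(), now called from dm_set_device_limits() in dm-table.c, honours that flag so a table stacked on such a dm device no longer has its max_hw_sectors clamped to PAGE_SIZE.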
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | |||
1995 | struct request_queue *q = md->queue; | 2055 | struct request_queue *q = md->queue; |
1996 | sector_t size; | 2056 | sector_t size; |
1997 | unsigned long flags; | 2057 | unsigned long flags; |
2058 | int merge_is_optional; | ||
1998 | 2059 | ||
1999 | size = dm_table_get_size(t); | 2060 | size = dm_table_get_size(t); |
2000 | 2061 | ||
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | |||
2020 | 2081 | ||
2021 | __bind_mempools(md, t); | 2082 | __bind_mempools(md, t); |
2022 | 2083 | ||
2084 | merge_is_optional = dm_table_merge_is_optional(t); | ||
2085 | |||
2023 | write_lock_irqsave(&md->map_lock, flags); | 2086 | write_lock_irqsave(&md->map_lock, flags); |
2024 | old_map = md->map; | 2087 | old_map = md->map; |
2025 | md->map = t; | 2088 | md->map = t; |
2026 | dm_table_set_restrictions(t, q, limits); | 2089 | dm_table_set_restrictions(t, q, limits); |
2090 | if (merge_is_optional) | ||
2091 | set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); | ||
2092 | else | ||
2093 | clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); | ||
2027 | write_unlock_irqrestore(&md->map_lock, flags); | 2094 | write_unlock_irqrestore(&md->map_lock, flags); |
2028 | 2095 | ||
2029 | return old_map; | 2096 | return old_map; |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1aaf16746da..6745dbd278a 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t); | |||
66 | void dm_table_free_md_mempools(struct dm_table *t); | 66 | void dm_table_free_md_mempools(struct dm_table *t); |
67 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | 67 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); |
68 | 68 | ||
69 | int dm_queue_merge_is_compulsory(struct request_queue *q); | ||
70 | |||
69 | void dm_lock_md_type(struct mapped_device *md); | 71 | void dm_lock_md_type(struct mapped_device *md); |
70 | void dm_unlock_md_type(struct mapped_device *md); | 72 | void dm_unlock_md_type(struct mapped_device *md); |
71 | void dm_set_md_type(struct mapped_device *md, unsigned type); | 73 | void dm_set_md_type(struct mapped_device *md, unsigned type); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index abfb59a61ed..6cd2c313e80 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -213,12 +213,6 @@ static int linear_run (mddev_t *mddev) | |||
213 | return md_integrity_register(mddev); | 213 | return md_integrity_register(mddev); |
214 | } | 214 | } |
215 | 215 | ||
216 | static void free_conf(struct rcu_head *head) | ||
217 | { | ||
218 | linear_conf_t *conf = container_of(head, linear_conf_t, rcu); | ||
219 | kfree(conf); | ||
220 | } | ||
221 | |||
222 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | 216 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) |
223 | { | 217 | { |
224 | /* Adding a drive to a linear array allows the array to grow. | 218 | /* Adding a drive to a linear array allows the array to grow. |
@@ -247,7 +241,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
247 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 241 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
248 | set_capacity(mddev->gendisk, mddev->array_sectors); | 242 | set_capacity(mddev->gendisk, mddev->array_sectors); |
249 | revalidate_disk(mddev->gendisk); | 243 | revalidate_disk(mddev->gendisk); |
250 | call_rcu(&oldconf->rcu, free_conf); | 244 | kfree_rcu(oldconf, rcu); |
251 | return 0; | 245 | return 0; |
252 | } | 246 | } |
253 | 247 | ||
diff --git a/drivers/md/linear.h b/drivers/md/linear.h index 0ce29b61605..2f2da05b2ce 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h | |||
@@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t; | |||
10 | 10 | ||
11 | struct linear_private_data | 11 | struct linear_private_data |
12 | { | 12 | { |
13 | struct rcu_head rcu; | ||
13 | sector_t array_sectors; | 14 | sector_t array_sectors; |
14 | dev_info_t disks[0]; | 15 | dev_info_t disks[0]; |
15 | struct rcu_head rcu; | ||
16 | }; | 16 | }; |
17 | 17 | ||
18 | 18 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 91e31e260b4..5c95ccb5950 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -61,6 +61,11 @@ | |||
61 | static void autostart_arrays(int part); | 61 | static void autostart_arrays(int part); |
62 | #endif | 62 | #endif |
63 | 63 | ||
64 | /* pers_list is a list of registered personalities protected | ||
65 | * by pers_lock. | ||
66 | * pers_lock also serves to protect accesses to | ||
67 | * mddev->thread when the mutex cannot be held. | ||
68 | */ | ||
64 | static LIST_HEAD(pers_list); | 69 | static LIST_HEAD(pers_list); |
65 | static DEFINE_SPINLOCK(pers_lock); | 70 | static DEFINE_SPINLOCK(pers_lock); |
66 | 71 | ||
@@ -215,6 +220,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
215 | } | 220 | } |
216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | 221 | EXPORT_SYMBOL_GPL(bio_clone_mddev); |
217 | 222 | ||
223 | void md_trim_bio(struct bio *bio, int offset, int size) | ||
224 | { | ||
225 | /* 'bio' is a cloned bio which we need to trim to match | ||
226 | * the given offset and size. | ||
227 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
228 | */ | ||
229 | int i; | ||
230 | struct bio_vec *bvec; | ||
231 | int sofar = 0; | ||
232 | |||
233 | size <<= 9; | ||
234 | if (offset == 0 && size == bio->bi_size) | ||
235 | return; | ||
236 | |||
237 | bio->bi_sector += offset; | ||
238 | bio->bi_size = size; | ||
239 | offset <<= 9; | ||
240 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
241 | |||
242 | while (bio->bi_idx < bio->bi_vcnt && | ||
243 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
244 | /* remove this whole bio_vec */ | ||
245 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
246 | bio->bi_idx++; | ||
247 | } | ||
248 | if (bio->bi_idx < bio->bi_vcnt) { | ||
249 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
250 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
251 | } | ||
252 | /* avoid any complications with bi_idx being non-zero */ | ||
253 | if (bio->bi_idx) { | ||
254 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
255 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
256 | bio->bi_vcnt -= bio->bi_idx; | ||
257 | bio->bi_idx = 0; | ||
258 | } | ||
259 | /* Make sure vcnt and last bv are not too big */ | ||
260 | bio_for_each_segment(bvec, bio, i) { | ||
261 | if (sofar + bvec->bv_len > size) | ||
262 | bvec->bv_len = size - sofar; | ||
263 | if (bvec->bv_len == 0) { | ||
264 | bio->bi_vcnt = i; | ||
265 | break; | ||
266 | } | ||
267 | sofar += bvec->bv_len; | ||
268 | } | ||
269 | } | ||
270 | EXPORT_SYMBOL_GPL(md_trim_bio); | ||
271 | |||
218 | /* | 272 | /* |
219 | * We have a system wide 'event count' that is incremented | 273 | * We have a system wide 'event count' that is incremented |
220 | * on any 'interesting' event, and readers of /proc/mdstat | 274 | * on any 'interesting' event, and readers of /proc/mdstat |
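A hedged sketch of a typical md_trim_bio() caller (hypothetical helper, not from this patch; it assumes bio_clone_mddev() takes the mddev as its third argument and that offset/size are in 512-byte sectors, as the shifts above imply):

	static void resubmit_partial(mddev_t *mddev, mdk_rdev_t *rdev,
				     struct bio *bio, int offset, int sectors)
	{
		struct bio *clone = bio_clone_mddev(bio, GFP_NOIO, mddev);

		/* keep only 'sectors' sectors, starting 'offset' sectors in */
		md_trim_bio(clone, offset, sectors);
		clone->bi_sector += rdev->data_offset;
		clone->bi_bdev = rdev->bdev;
		/* a real caller would also set bi_end_io/bi_private before submitting */
		generic_make_request(clone);
	}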
@@ -690,7 +744,12 @@ static void mddev_unlock(mddev_t * mddev) | |||
690 | } else | 744 | } else |
691 | mutex_unlock(&mddev->reconfig_mutex); | 745 | mutex_unlock(&mddev->reconfig_mutex); |
692 | 746 | ||
747 | /* as we've dropped the mutex we need a spinlock to | ||
748 | * make sure the thread doesn't disappear | ||
749 | */ | ||
750 | spin_lock(&pers_lock); | ||
693 | md_wakeup_thread(mddev->thread); | 751 | md_wakeup_thread(mddev->thread); |
752 | spin_unlock(&pers_lock); | ||
694 | } | 753 | } |
695 | 754 | ||
696 | static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) | 755 | static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) |
@@ -757,6 +816,10 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
757 | rdev->sb_start = 0; | 816 | rdev->sb_start = 0; |
758 | rdev->sectors = 0; | 817 | rdev->sectors = 0; |
759 | } | 818 | } |
819 | if (rdev->bb_page) { | ||
820 | put_page(rdev->bb_page); | ||
821 | rdev->bb_page = NULL; | ||
822 | } | ||
760 | } | 823 | } |
761 | 824 | ||
762 | 825 | ||
@@ -795,7 +858,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | |||
795 | bio->bi_end_io = super_written; | 858 | bio->bi_end_io = super_written; |
796 | 859 | ||
797 | atomic_inc(&mddev->pending_writes); | 860 | atomic_inc(&mddev->pending_writes); |
798 | submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); | 861 | submit_bio(WRITE_FLUSH_FUA, bio); |
799 | } | 862 | } |
800 | 863 | ||
801 | void md_super_wait(mddev_t *mddev) | 864 | void md_super_wait(mddev_t *mddev) |
@@ -1025,7 +1088,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1025 | ret = -EINVAL; | 1088 | ret = -EINVAL; |
1026 | 1089 | ||
1027 | bdevname(rdev->bdev, b); | 1090 | bdevname(rdev->bdev, b); |
1028 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1091 | sb = page_address(rdev->sb_page); |
1029 | 1092 | ||
1030 | if (sb->md_magic != MD_SB_MAGIC) { | 1093 | if (sb->md_magic != MD_SB_MAGIC) { |
1031 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 1094 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", |
@@ -1054,6 +1117,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1054 | rdev->preferred_minor = sb->md_minor; | 1117 | rdev->preferred_minor = sb->md_minor; |
1055 | rdev->data_offset = 0; | 1118 | rdev->data_offset = 0; |
1056 | rdev->sb_size = MD_SB_BYTES; | 1119 | rdev->sb_size = MD_SB_BYTES; |
1120 | rdev->badblocks.shift = -1; | ||
1057 | 1121 | ||
1058 | if (sb->level == LEVEL_MULTIPATH) | 1122 | if (sb->level == LEVEL_MULTIPATH) |
1059 | rdev->desc_nr = -1; | 1123 | rdev->desc_nr = -1; |
@@ -1064,7 +1128,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1064 | ret = 1; | 1128 | ret = 1; |
1065 | } else { | 1129 | } else { |
1066 | __u64 ev1, ev2; | 1130 | __u64 ev1, ev2; |
1067 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | 1131 | mdp_super_t *refsb = page_address(refdev->sb_page); |
1068 | if (!uuid_equal(refsb, sb)) { | 1132 | if (!uuid_equal(refsb, sb)) { |
1069 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1133 | printk(KERN_WARNING "md: %s has different UUID to %s\n", |
1070 | b, bdevname(refdev->bdev,b2)); | 1134 | b, bdevname(refdev->bdev,b2)); |
@@ -1084,8 +1148,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1084 | ret = 0; | 1148 | ret = 0; |
1085 | } | 1149 | } |
1086 | rdev->sectors = rdev->sb_start; | 1150 | rdev->sectors = rdev->sb_start; |
1151 | /* Limit to 4TB as metadata cannot record more than that */ | ||
1152 | if (rdev->sectors >= (2ULL << 32)) | ||
1153 | rdev->sectors = (2ULL << 32) - 2; | ||
1087 | 1154 | ||
1088 | if (rdev->sectors < sb->size * 2 && sb->level > 1) | 1155 | if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) |
1089 | /* "this cannot possibly happen" ... */ | 1156 | /* "this cannot possibly happen" ... */ |
1090 | ret = -EINVAL; | 1157 | ret = -EINVAL; |
1091 | 1158 | ||
@@ -1099,7 +1166,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1099 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1166 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1100 | { | 1167 | { |
1101 | mdp_disk_t *desc; | 1168 | mdp_disk_t *desc; |
1102 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | 1169 | mdp_super_t *sb = page_address(rdev->sb_page); |
1103 | __u64 ev1 = md_event(sb); | 1170 | __u64 ev1 = md_event(sb); |
1104 | 1171 | ||
1105 | rdev->raid_disk = -1; | 1172 | rdev->raid_disk = -1; |
@@ -1119,7 +1186,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1119 | mddev->clevel[0] = 0; | 1186 | mddev->clevel[0] = 0; |
1120 | mddev->layout = sb->layout; | 1187 | mddev->layout = sb->layout; |
1121 | mddev->raid_disks = sb->raid_disks; | 1188 | mddev->raid_disks = sb->raid_disks; |
1122 | mddev->dev_sectors = sb->size * 2; | 1189 | mddev->dev_sectors = ((sector_t)sb->size) * 2; |
1123 | mddev->events = ev1; | 1190 | mddev->events = ev1; |
1124 | mddev->bitmap_info.offset = 0; | 1191 | mddev->bitmap_info.offset = 0; |
1125 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 1192 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
@@ -1230,7 +1297,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1230 | 1297 | ||
1231 | rdev->sb_size = MD_SB_BYTES; | 1298 | rdev->sb_size = MD_SB_BYTES; |
1232 | 1299 | ||
1233 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1300 | sb = page_address(rdev->sb_page); |
1234 | 1301 | ||
1235 | memset(sb, 0, sizeof(*sb)); | 1302 | memset(sb, 0, sizeof(*sb)); |
1236 | 1303 | ||
@@ -1361,6 +1428,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1361 | rdev->sb_start = calc_dev_sboffset(rdev); | 1428 | rdev->sb_start = calc_dev_sboffset(rdev); |
1362 | if (!num_sectors || num_sectors > rdev->sb_start) | 1429 | if (!num_sectors || num_sectors > rdev->sb_start) |
1363 | num_sectors = rdev->sb_start; | 1430 | num_sectors = rdev->sb_start; |
1431 | /* Limit to 4TB as metadata cannot record more than that. | ||
1432 | * 4TB == 2^32 KB, or 2*2^32 sectors. | ||
1433 | */ | ||
1434 | if (num_sectors >= (2ULL << 32)) | ||
1435 | num_sectors = (2ULL << 32) - 2; | ||
1364 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1436 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1365 | rdev->sb_page); | 1437 | rdev->sb_page); |
1366 | md_super_wait(rdev->mddev); | 1438 | md_super_wait(rdev->mddev); |
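Worked check of the 4TB cap applied in both v0.90 hunks above: the 0.90 superblock records device size as a 32-bit count of kilobytes, so the largest representable size is 2^32 KB = 2^42 bytes = 4 TB = 2^33 512-byte sectors, i.e. 2ULL << 32; clamping to (2ULL << 32) - 2 stays just under that limit while remaining a whole number of kilobytes (2 sectors = 1 KB). The reason for subtracting 2 is an inference, not stated in the patch.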
@@ -1395,6 +1467,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) | |||
1395 | return cpu_to_le32(csum); | 1467 | return cpu_to_le32(csum); |
1396 | } | 1468 | } |
1397 | 1469 | ||
1470 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
1471 | int acknowledged); | ||
1398 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 1472 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
1399 | { | 1473 | { |
1400 | struct mdp_superblock_1 *sb; | 1474 | struct mdp_superblock_1 *sb; |
@@ -1435,7 +1509,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1435 | if (ret) return ret; | 1509 | if (ret) return ret; |
1436 | 1510 | ||
1437 | 1511 | ||
1438 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1512 | sb = page_address(rdev->sb_page); |
1439 | 1513 | ||
1440 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | 1514 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
1441 | sb->major_version != cpu_to_le32(1) || | 1515 | sb->major_version != cpu_to_le32(1) || |
@@ -1473,12 +1547,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1473 | else | 1547 | else |
1474 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | 1548 | rdev->desc_nr = le32_to_cpu(sb->dev_number); |
1475 | 1549 | ||
1550 | if (!rdev->bb_page) { | ||
1551 | rdev->bb_page = alloc_page(GFP_KERNEL); | ||
1552 | if (!rdev->bb_page) | ||
1553 | return -ENOMEM; | ||
1554 | } | ||
1555 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && | ||
1556 | rdev->badblocks.count == 0) { | ||
1557 | /* need to load the bad block list. | ||
1558 | * Currently we limit it to one page. | ||
1559 | */ | ||
1560 | s32 offset; | ||
1561 | sector_t bb_sector; | ||
1562 | u64 *bbp; | ||
1563 | int i; | ||
1564 | int sectors = le16_to_cpu(sb->bblog_size); | ||
1565 | if (sectors > (PAGE_SIZE / 512)) | ||
1566 | return -EINVAL; | ||
1567 | offset = le32_to_cpu(sb->bblog_offset); | ||
1568 | if (offset == 0) | ||
1569 | return -EINVAL; | ||
1570 | bb_sector = (long long)offset; | ||
1571 | if (!sync_page_io(rdev, bb_sector, sectors << 9, | ||
1572 | rdev->bb_page, READ, true)) | ||
1573 | return -EIO; | ||
1574 | bbp = (u64 *)page_address(rdev->bb_page); | ||
1575 | rdev->badblocks.shift = sb->bblog_shift; | ||
1576 | for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { | ||
1577 | u64 bb = le64_to_cpu(*bbp); | ||
1578 | int count = bb & (0x3ff); | ||
1579 | u64 sector = bb >> 10; | ||
1580 | sector <<= sb->bblog_shift; | ||
1581 | count <<= sb->bblog_shift; | ||
1582 | if (bb + 1 == 0) | ||
1583 | break; | ||
1584 | if (md_set_badblocks(&rdev->badblocks, | ||
1585 | sector, count, 1) == 0) | ||
1586 | return -EINVAL; | ||
1587 | } | ||
1588 | } else if (sb->bblog_offset == 0) | ||
1589 | rdev->badblocks.shift = -1; | ||
1590 | |||
1476 | if (!refdev) { | 1591 | if (!refdev) { |
1477 | ret = 1; | 1592 | ret = 1; |
1478 | } else { | 1593 | } else { |
1479 | __u64 ev1, ev2; | 1594 | __u64 ev1, ev2; |
1480 | struct mdp_superblock_1 *refsb = | 1595 | struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
1481 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
1482 | 1596 | ||
1483 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | 1597 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
1484 | sb->level != refsb->level || | 1598 | sb->level != refsb->level || |
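The loop above fixes the on-disk bad-block record format: each little-endian 64-bit word carries the start (in units of 1 << bblog_shift sectors) above bit 10 and the length in the low 10 bits, and a word of all ones (bb + 1 == 0) terminates the list. A standalone userspace sketch of that packing, for illustration only (the bb_pack() helper and the example values are assumptions; bb_unpack() mirrors the decode loop above, with the le64 byte-swap omitted):

	#include <stdint.h>
	#include <stdio.h>

	/* low 10 bits = length, remaining bits = start, both scaled by 1 << shift */
	static void bb_unpack(uint64_t bb, int shift,
			      uint64_t *sector, unsigned *count)
	{
		*count = (unsigned)(bb & 0x3ff) << shift;
		*sector = (bb >> 10) << shift;
	}

	/* assumed inverse, matching what super_1_sync() writes out */
	static uint64_t bb_pack(uint64_t sector, unsigned count, int shift)
	{
		return ((sector >> shift) << 10) | ((count >> shift) & 0x3ff);
	}

	int main(void)
	{
		uint64_t sector;
		unsigned count;

		bb_unpack(bb_pack(123456, 8, 0), 0, &sector, &count);
		printf("bad range: sector %llu, %u sectors\n",
		       (unsigned long long)sector, count);
		return 0;
	}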
@@ -1513,7 +1627,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1513 | 1627 | ||
1514 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1628 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1515 | { | 1629 | { |
1516 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1630 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
1517 | __u64 ev1 = le64_to_cpu(sb->events); | 1631 | __u64 ev1 = le64_to_cpu(sb->events); |
1518 | 1632 | ||
1519 | rdev->raid_disk = -1; | 1633 | rdev->raid_disk = -1; |
@@ -1619,13 +1733,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1619 | int max_dev, i; | 1733 | int max_dev, i; |
1620 | /* make rdev->sb match mddev and rdev data. */ | 1734 | /* make rdev->sb match mddev and rdev data. */ |
1621 | 1735 | ||
1622 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1736 | sb = page_address(rdev->sb_page); |
1623 | 1737 | ||
1624 | sb->feature_map = 0; | 1738 | sb->feature_map = 0; |
1625 | sb->pad0 = 0; | 1739 | sb->pad0 = 0; |
1626 | sb->recovery_offset = cpu_to_le64(0); | 1740 | sb->recovery_offset = cpu_to_le64(0); |
1627 | memset(sb->pad1, 0, sizeof(sb->pad1)); | 1741 | memset(sb->pad1, 0, sizeof(sb->pad1)); |
1628 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
1629 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1742 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
1630 | 1743 | ||
1631 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1744 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
@@ -1643,6 +1756,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1643 | sb->level = cpu_to_le32(mddev->level); | 1756 | sb->level = cpu_to_le32(mddev->level); |
1644 | sb->layout = cpu_to_le32(mddev->layout); | 1757 | sb->layout = cpu_to_le32(mddev->layout); |
1645 | 1758 | ||
1759 | if (test_bit(WriteMostly, &rdev->flags)) | ||
1760 | sb->devflags |= WriteMostly1; | ||
1761 | else | ||
1762 | sb->devflags &= ~WriteMostly1; | ||
1763 | |||
1646 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { | 1764 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
1647 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); | 1765 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
1648 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); | 1766 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
@@ -1665,6 +1783,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1665 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1783 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
1666 | } | 1784 | } |
1667 | 1785 | ||
1786 | if (rdev->badblocks.count == 0) | ||
1787 | /* Nothing to do for bad blocks */ ; | ||
1788 | else if (sb->bblog_offset == 0) | ||
1789 | /* Cannot record bad blocks on this device */ | ||
1790 | md_error(mddev, rdev); | ||
1791 | else { | ||
1792 | struct badblocks *bb = &rdev->badblocks; | ||
1793 | u64 *bbp = (u64 *)page_address(rdev->bb_page); | ||
1794 | u64 *p = bb->page; | ||
1795 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); | ||
1796 | if (bb->changed) { | ||
1797 | unsigned seq; | ||
1798 | |||
1799 | retry: | ||
1800 | seq = read_seqbegin(&bb->lock); | ||
1801 | |||
1802 | memset(bbp, 0xff, PAGE_SIZE); | ||
1803 | |||
1804 | for (i = 0 ; i < bb->count ; i++) { | ||
1805 | u64 internal_bb = *p++; | ||
1806 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | ||
1807 | | BB_LEN(internal_bb)); | ||
1808 | *bbp++ = cpu_to_le64(store_bb); | ||
1809 | } | ||
1810 | if (read_seqretry(&bb->lock, seq)) | ||
1811 | goto retry; | ||
1812 | |||
1813 | bb->sector = (rdev->sb_start + | ||
1814 | (int)le32_to_cpu(sb->bblog_offset)); | ||
1815 | bb->size = le16_to_cpu(sb->bblog_size); | ||
1816 | bb->changed = 0; | ||
1817 | } | ||
1818 | } | ||
1819 | |||
1668 | max_dev = 0; | 1820 | max_dev = 0; |
1669 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 1821 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
1670 | if (rdev2->desc_nr+1 > max_dev) | 1822 | if (rdev2->desc_nr+1 > max_dev) |
@@ -1724,7 +1876,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1724 | num_sectors = max_sectors; | 1876 | num_sectors = max_sectors; |
1725 | rdev->sb_start = sb_start; | 1877 | rdev->sb_start = sb_start; |
1726 | } | 1878 | } |
1727 | sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); | 1879 | sb = page_address(rdev->sb_page); |
1728 | sb->data_size = cpu_to_le64(num_sectors); | 1880 | sb->data_size = cpu_to_le64(num_sectors); |
1729 | sb->super_offset = rdev->sb_start; | 1881 | sb->super_offset = rdev->sb_start; |
1730 | sb->sb_csum = calc_sb_1_csum(sb); | 1882 | sb->sb_csum = calc_sb_1_csum(sb); |
@@ -1922,7 +2074,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); | 2074 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
1923 | 2075 | ||
1924 | /* May as well allow recovery to be retried once */ | 2076 | /* May as well allow recovery to be retried once */ |
1925 | mddev->recovery_disabled = 0; | 2077 | mddev->recovery_disabled++; |
1926 | 2078 | ||
1927 | return 0; | 2079 | return 0; |
1928 | 2080 | ||
@@ -1953,6 +2105,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1953 | sysfs_remove_link(&rdev->kobj, "block"); | 2105 | sysfs_remove_link(&rdev->kobj, "block"); |
1954 | sysfs_put(rdev->sysfs_state); | 2106 | sysfs_put(rdev->sysfs_state); |
1955 | rdev->sysfs_state = NULL; | 2107 | rdev->sysfs_state = NULL; |
2108 | kfree(rdev->badblocks.page); | ||
2109 | rdev->badblocks.count = 0; | ||
2110 | rdev->badblocks.page = NULL; | ||
1956 | /* We need to delay this, otherwise we can deadlock when | 2111 | /* We need to delay this, otherwise we can deadlock when |
1957 | * writing to 'remove' to "dev/state". We also need | 2112 | * writing to 'remove' to "dev/state". We also need |
1958 | * to delay it due to rcu usage. | 2113 | * to delay it due to rcu usage. |
@@ -2127,10 +2282,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) | |||
2127 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); | 2282 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); |
2128 | switch (major_version) { | 2283 | switch (major_version) { |
2129 | case 0: | 2284 | case 0: |
2130 | print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); | 2285 | print_sb_90(page_address(rdev->sb_page)); |
2131 | break; | 2286 | break; |
2132 | case 1: | 2287 | case 1: |
2133 | print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); | 2288 | print_sb_1(page_address(rdev->sb_page)); |
2134 | break; | 2289 | break; |
2135 | } | 2290 | } |
2136 | } else | 2291 | } else |
@@ -2194,6 +2349,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
2194 | mdk_rdev_t *rdev; | 2349 | mdk_rdev_t *rdev; |
2195 | int sync_req; | 2350 | int sync_req; |
2196 | int nospares = 0; | 2351 | int nospares = 0; |
2352 | int any_badblocks_changed = 0; | ||
2197 | 2353 | ||
2198 | repeat: | 2354 | repeat: |
2199 | /* First make sure individual recovery_offsets are correct */ | 2355 | /* First make sure individual recovery_offsets are correct */ |
@@ -2208,8 +2364,18 @@ repeat: | |||
2208 | if (!mddev->persistent) { | 2364 | if (!mddev->persistent) { |
2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2365 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2366 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2211 | if (!mddev->external) | 2367 | if (!mddev->external) { |
2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2368 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
2369 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2370 | if (rdev->badblocks.changed) { | ||
2371 | md_ack_all_badblocks(&rdev->badblocks); | ||
2372 | md_error(mddev, rdev); | ||
2373 | } | ||
2374 | clear_bit(Blocked, &rdev->flags); | ||
2375 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2376 | wake_up(&rdev->blocked_wait); | ||
2377 | } | ||
2378 | } | ||
2213 | wake_up(&mddev->sb_wait); | 2379 | wake_up(&mddev->sb_wait); |
2214 | return; | 2380 | return; |
2215 | } | 2381 | } |
@@ -2265,6 +2431,14 @@ repeat: | |||
2265 | MD_BUG(); | 2431 | MD_BUG(); |
2266 | mddev->events --; | 2432 | mddev->events --; |
2267 | } | 2433 | } |
2434 | |||
2435 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2436 | if (rdev->badblocks.changed) | ||
2437 | any_badblocks_changed++; | ||
2438 | if (test_bit(Faulty, &rdev->flags)) | ||
2439 | set_bit(FaultRecorded, &rdev->flags); | ||
2440 | } | ||
2441 | |||
2268 | sync_sbs(mddev, nospares); | 2442 | sync_sbs(mddev, nospares); |
2269 | spin_unlock_irq(&mddev->write_lock); | 2443 | spin_unlock_irq(&mddev->write_lock); |
2270 | 2444 | ||
@@ -2290,6 +2464,13 @@ repeat: | |||
2290 | bdevname(rdev->bdev,b), | 2464 | bdevname(rdev->bdev,b), |
2291 | (unsigned long long)rdev->sb_start); | 2465 | (unsigned long long)rdev->sb_start); |
2292 | rdev->sb_events = mddev->events; | 2466 | rdev->sb_events = mddev->events; |
2467 | if (rdev->badblocks.size) { | ||
2468 | md_super_write(mddev, rdev, | ||
2469 | rdev->badblocks.sector, | ||
2470 | rdev->badblocks.size << 9, | ||
2471 | rdev->bb_page); | ||
2472 | rdev->badblocks.size = 0; | ||
2473 | } | ||
2293 | 2474 | ||
2294 | } else | 2475 | } else |
2295 | dprintk(")\n"); | 2476 | dprintk(")\n"); |
@@ -2313,6 +2494,15 @@ repeat: | |||
2313 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2494 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2314 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2495 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
2315 | 2496 | ||
2497 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2498 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) | ||
2499 | clear_bit(Blocked, &rdev->flags); | ||
2500 | |||
2501 | if (any_badblocks_changed) | ||
2502 | md_ack_all_badblocks(&rdev->badblocks); | ||
2503 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2504 | wake_up(&rdev->blocked_wait); | ||
2505 | } | ||
2316 | } | 2506 | } |
2317 | 2507 | ||
2318 | /* words written to sysfs files may, or may not, be \n terminated. | 2508 | /* words written to sysfs files may, or may not, be \n terminated. |
@@ -2347,7 +2537,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2347 | char *sep = ""; | 2537 | char *sep = ""; |
2348 | size_t len = 0; | 2538 | size_t len = 0; |
2349 | 2539 | ||
2350 | if (test_bit(Faulty, &rdev->flags)) { | 2540 | if (test_bit(Faulty, &rdev->flags) || |
2541 | rdev->badblocks.unacked_exist) { | ||
2351 | len+= sprintf(page+len, "%sfaulty",sep); | 2542 | len+= sprintf(page+len, "%sfaulty",sep); |
2352 | sep = ","; | 2543 | sep = ","; |
2353 | } | 2544 | } |
@@ -2359,7 +2550,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2359 | len += sprintf(page+len, "%swrite_mostly",sep); | 2550 | len += sprintf(page+len, "%swrite_mostly",sep); |
2360 | sep = ","; | 2551 | sep = ","; |
2361 | } | 2552 | } |
2362 | if (test_bit(Blocked, &rdev->flags)) { | 2553 | if (test_bit(Blocked, &rdev->flags) || |
2554 | rdev->badblocks.unacked_exist) { | ||
2363 | len += sprintf(page+len, "%sblocked", sep); | 2555 | len += sprintf(page+len, "%sblocked", sep); |
2364 | sep = ","; | 2556 | sep = ","; |
2365 | } | 2557 | } |
@@ -2368,6 +2560,10 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2368 | len += sprintf(page+len, "%sspare", sep); | 2560 | len += sprintf(page+len, "%sspare", sep); |
2369 | sep = ","; | 2561 | sep = ","; |
2370 | } | 2562 | } |
2563 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
2564 | len += sprintf(page+len, "%swrite_error", sep); | ||
2565 | sep = ","; | ||
2566 | } | ||
2371 | return len+sprintf(page+len, "\n"); | 2567 | return len+sprintf(page+len, "\n"); |
2372 | } | 2568 | } |
2373 | 2569 | ||
@@ -2375,18 +2571,23 @@ static ssize_t | |||
2375 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2571 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2376 | { | 2572 | { |
2377 | /* can write | 2573 | /* can write |
2378 | * faulty - simulates and error | 2574 | * faulty - simulates an error |
2379 | * remove - disconnects the device | 2575 | * remove - disconnects the device |
2380 | * writemostly - sets write_mostly | 2576 | * writemostly - sets write_mostly |
2381 | * -writemostly - clears write_mostly | 2577 | * -writemostly - clears write_mostly |
2382 | * blocked - sets the Blocked flag | 2578 | * blocked - sets the Blocked flags |
2383 | * -blocked - clears the Blocked flag | 2579 | * -blocked - clears the Blocked and possibly simulates an error |
2384 | * insync - sets Insync providing device isn't active | 2580 | * insync - sets Insync providing device isn't active |
2581 | * write_error - sets WriteErrorSeen | ||
2582 | * -write_error - clears WriteErrorSeen | ||
2385 | */ | 2583 | */ |
2386 | int err = -EINVAL; | 2584 | int err = -EINVAL; |
2387 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2585 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
2388 | md_error(rdev->mddev, rdev); | 2586 | md_error(rdev->mddev, rdev); |
2389 | err = 0; | 2587 | if (test_bit(Faulty, &rdev->flags)) |
2588 | err = 0; | ||
2589 | else | ||
2590 | err = -EBUSY; | ||
2390 | } else if (cmd_match(buf, "remove")) { | 2591 | } else if (cmd_match(buf, "remove")) { |
2391 | if (rdev->raid_disk >= 0) | 2592 | if (rdev->raid_disk >= 0) |
2392 | err = -EBUSY; | 2593 | err = -EBUSY; |
@@ -2408,7 +2609,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2408 | set_bit(Blocked, &rdev->flags); | 2609 | set_bit(Blocked, &rdev->flags); |
2409 | err = 0; | 2610 | err = 0; |
2410 | } else if (cmd_match(buf, "-blocked")) { | 2611 | } else if (cmd_match(buf, "-blocked")) { |
2612 | if (!test_bit(Faulty, &rdev->flags) && | ||
2613 | rdev->badblocks.unacked_exist) { | ||
2614 | /* metadata handler doesn't understand badblocks, | ||
2615 | * so we need to fail the device | ||
2616 | */ | ||
2617 | md_error(rdev->mddev, rdev); | ||
2618 | } | ||
2411 | clear_bit(Blocked, &rdev->flags); | 2619 | clear_bit(Blocked, &rdev->flags); |
2620 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2412 | wake_up(&rdev->blocked_wait); | 2621 | wake_up(&rdev->blocked_wait); |
2413 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2622 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2414 | md_wakeup_thread(rdev->mddev->thread); | 2623 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2417,6 +2626,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2417 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2626 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
2418 | set_bit(In_sync, &rdev->flags); | 2627 | set_bit(In_sync, &rdev->flags); |
2419 | err = 0; | 2628 | err = 0; |
2629 | } else if (cmd_match(buf, "write_error")) { | ||
2630 | set_bit(WriteErrorSeen, &rdev->flags); | ||
2631 | err = 0; | ||
2632 | } else if (cmd_match(buf, "-write_error")) { | ||
2633 | clear_bit(WriteErrorSeen, &rdev->flags); | ||
2634 | err = 0; | ||
2420 | } | 2635 | } |
2421 | if (!err) | 2636 | if (!err) |
2422 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2637 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
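Editor's note: a minimal user-space sketch of driving the per-rdev 'state' attribute described in the comment above. The sysfs path is an assumption and depends on the array and member device names; only the attribute itself is taken from this patch.

    /* toggle the WriteErrorSeen flag through the rdev "state" attribute */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/sys/block/md0/md/dev-sda1/state"; /* assumed layout */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* "write_error" sets WriteErrorSeen, "-write_error" clears it;
         * the same file also accepts "faulty", "remove", "blocked", ... */
        if (write(fd, "write_error", strlen("write_error")) < 0)
            perror("write");
        close(fd);
        return 0;
    }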
@@ -2459,7 +2674,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2459 | { | 2674 | { |
2460 | char *e; | 2675 | char *e; |
2461 | int err; | 2676 | int err; |
2462 | char nm[20]; | ||
2463 | int slot = simple_strtoul(buf, &e, 10); | 2677 | int slot = simple_strtoul(buf, &e, 10); |
2464 | if (strncmp(buf, "none", 4)==0) | 2678 | if (strncmp(buf, "none", 4)==0) |
2465 | slot = -1; | 2679 | slot = -1; |
@@ -2482,8 +2696,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2696 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
2483 | if (err) | 2697 | if (err) |
2484 | return err; | 2698 | return err; |
2485 | sprintf(nm, "rd%d", rdev->raid_disk); | 2699 | sysfs_unlink_rdev(rdev->mddev, rdev); |
2486 | sysfs_remove_link(&rdev->mddev->kobj, nm); | ||
2487 | rdev->raid_disk = -1; | 2700 | rdev->raid_disk = -1; |
2488 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2701 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2489 | md_wakeup_thread(rdev->mddev->thread); | 2702 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2522,8 +2735,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2522 | return err; | 2735 | return err; |
2523 | } else | 2736 | } else |
2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2737 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
2525 | sprintf(nm, "rd%d", rdev->raid_disk); | 2738 | if (sysfs_link_rdev(rdev->mddev, rdev)) |
2526 | if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) | ||
2527 | /* failure here is OK */; | 2739 | /* failure here is OK */; |
2528 | /* don't wakeup anyone, leave that to userspace. */ | 2740 | /* don't wakeup anyone, leave that to userspace. */ |
2529 | } else { | 2741 | } else { |
@@ -2712,6 +2924,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le | |||
2712 | static struct rdev_sysfs_entry rdev_recovery_start = | 2924 | static struct rdev_sysfs_entry rdev_recovery_start = |
2713 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | 2925 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
2714 | 2926 | ||
2927 | |||
2928 | static ssize_t | ||
2929 | badblocks_show(struct badblocks *bb, char *page, int unack); | ||
2930 | static ssize_t | ||
2931 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); | ||
2932 | |||
2933 | static ssize_t bb_show(mdk_rdev_t *rdev, char *page) | ||
2934 | { | ||
2935 | return badblocks_show(&rdev->badblocks, page, 0); | ||
2936 | } | ||
2937 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2938 | { | ||
2939 | int rv = badblocks_store(&rdev->badblocks, page, len, 0); | ||
2940 | /* Maybe that ack was all we needed */ | ||
2941 | if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) | ||
2942 | wake_up(&rdev->blocked_wait); | ||
2943 | return rv; | ||
2944 | } | ||
2945 | static struct rdev_sysfs_entry rdev_bad_blocks = | ||
2946 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); | ||
2947 | |||
2948 | |||
2949 | static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) | ||
2950 | { | ||
2951 | return badblocks_show(&rdev->badblocks, page, 1); | ||
2952 | } | ||
2953 | static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2954 | { | ||
2955 | return badblocks_store(&rdev->badblocks, page, len, 1); | ||
2956 | } | ||
2957 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = | ||
2958 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); | ||
2959 | |||
2715 | static struct attribute *rdev_default_attrs[] = { | 2960 | static struct attribute *rdev_default_attrs[] = { |
2716 | &rdev_state.attr, | 2961 | &rdev_state.attr, |
2717 | &rdev_errors.attr, | 2962 | &rdev_errors.attr, |
@@ -2719,6 +2964,8 @@ static struct attribute *rdev_default_attrs[] = { | |||
2719 | &rdev_offset.attr, | 2964 | &rdev_offset.attr, |
2720 | &rdev_size.attr, | 2965 | &rdev_size.attr, |
2721 | &rdev_recovery_start.attr, | 2966 | &rdev_recovery_start.attr, |
2967 | &rdev_bad_blocks.attr, | ||
2968 | &rdev_unack_bad_blocks.attr, | ||
2722 | NULL, | 2969 | NULL, |
2723 | }; | 2970 | }; |
2724 | static ssize_t | 2971 | static ssize_t |
@@ -2782,7 +3029,7 @@ static struct kobj_type rdev_ktype = { | |||
2782 | .default_attrs = rdev_default_attrs, | 3029 | .default_attrs = rdev_default_attrs, |
2783 | }; | 3030 | }; |
2784 | 3031 | ||
2785 | void md_rdev_init(mdk_rdev_t *rdev) | 3032 | int md_rdev_init(mdk_rdev_t *rdev) |
2786 | { | 3033 | { |
2787 | rdev->desc_nr = -1; | 3034 | rdev->desc_nr = -1; |
2788 | rdev->saved_raid_disk = -1; | 3035 | rdev->saved_raid_disk = -1; |
@@ -2792,12 +3039,27 @@ void md_rdev_init(mdk_rdev_t *rdev) | |||
2792 | rdev->sb_events = 0; | 3039 | rdev->sb_events = 0; |
2793 | rdev->last_read_error.tv_sec = 0; | 3040 | rdev->last_read_error.tv_sec = 0; |
2794 | rdev->last_read_error.tv_nsec = 0; | 3041 | rdev->last_read_error.tv_nsec = 0; |
3042 | rdev->sb_loaded = 0; | ||
3043 | rdev->bb_page = NULL; | ||
2795 | atomic_set(&rdev->nr_pending, 0); | 3044 | atomic_set(&rdev->nr_pending, 0); |
2796 | atomic_set(&rdev->read_errors, 0); | 3045 | atomic_set(&rdev->read_errors, 0); |
2797 | atomic_set(&rdev->corrected_errors, 0); | 3046 | atomic_set(&rdev->corrected_errors, 0); |
2798 | 3047 | ||
2799 | INIT_LIST_HEAD(&rdev->same_set); | 3048 | INIT_LIST_HEAD(&rdev->same_set); |
2800 | init_waitqueue_head(&rdev->blocked_wait); | 3049 | init_waitqueue_head(&rdev->blocked_wait); |
3050 | |||
3051 | /* Add space to store bad block list. | ||
3052 | * This reserves the space even on arrays where it cannot | ||
3053 | * be used - I wonder if that matters | ||
3054 | */ | ||
3055 | rdev->badblocks.count = 0; | ||
3056 | rdev->badblocks.shift = 0; | ||
3057 | rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
3058 | seqlock_init(&rdev->badblocks.lock); | ||
3059 | if (rdev->badblocks.page == NULL) | ||
3060 | return -ENOMEM; | ||
3061 | |||
3062 | return 0; | ||
2801 | } | 3063 | } |
2802 | EXPORT_SYMBOL_GPL(md_rdev_init); | 3064 | EXPORT_SYMBOL_GPL(md_rdev_init); |
2803 | /* | 3065 | /* |
@@ -2823,8 +3085,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2823 | return ERR_PTR(-ENOMEM); | 3085 | return ERR_PTR(-ENOMEM); |
2824 | } | 3086 | } |
2825 | 3087 | ||
2826 | md_rdev_init(rdev); | 3088 | err = md_rdev_init(rdev); |
2827 | if ((err = alloc_disk_sb(rdev))) | 3089 | if (err) |
3090 | goto abort_free; | ||
3091 | err = alloc_disk_sb(rdev); | ||
3092 | if (err) | ||
2828 | goto abort_free; | 3093 | goto abort_free; |
2829 | 3094 | ||
2830 | err = lock_rdev(rdev, newdev, super_format == -2); | 3095 | err = lock_rdev(rdev, newdev, super_format == -2); |
@@ -2860,15 +3125,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2860 | goto abort_free; | 3125 | goto abort_free; |
2861 | } | 3126 | } |
2862 | } | 3127 | } |
3128 | if (super_format == -1) | ||
3129 | /* hot-add for 0.90, or non-persistent: so no badblocks */ | ||
3130 | rdev->badblocks.shift = -1; | ||
2863 | 3131 | ||
2864 | return rdev; | 3132 | return rdev; |
2865 | 3133 | ||
2866 | abort_free: | 3134 | abort_free: |
2867 | if (rdev->sb_page) { | 3135 | if (rdev->bdev) |
2868 | if (rdev->bdev) | 3136 | unlock_rdev(rdev); |
2869 | unlock_rdev(rdev); | 3137 | free_disk_sb(rdev); |
2870 | free_disk_sb(rdev); | 3138 | kfree(rdev->badblocks.page); |
2871 | } | ||
2872 | kfree(rdev); | 3139 | kfree(rdev); |
2873 | return ERR_PTR(err); | 3140 | return ERR_PTR(err); |
2874 | } | 3141 | } |
@@ -3149,15 +3416,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3149 | } | 3416 | } |
3150 | 3417 | ||
3151 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3418 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3152 | char nm[20]; | ||
3153 | if (rdev->raid_disk < 0) | 3419 | if (rdev->raid_disk < 0) |
3154 | continue; | 3420 | continue; |
3155 | if (rdev->new_raid_disk >= mddev->raid_disks) | 3421 | if (rdev->new_raid_disk >= mddev->raid_disks) |
3156 | rdev->new_raid_disk = -1; | 3422 | rdev->new_raid_disk = -1; |
3157 | if (rdev->new_raid_disk == rdev->raid_disk) | 3423 | if (rdev->new_raid_disk == rdev->raid_disk) |
3158 | continue; | 3424 | continue; |
3159 | sprintf(nm, "rd%d", rdev->raid_disk); | 3425 | sysfs_unlink_rdev(mddev, rdev); |
3160 | sysfs_remove_link(&mddev->kobj, nm); | ||
3161 | } | 3426 | } |
3162 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3427 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3163 | if (rdev->raid_disk < 0) | 3428 | if (rdev->raid_disk < 0) |
@@ -3168,11 +3433,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3168 | if (rdev->raid_disk < 0) | 3433 | if (rdev->raid_disk < 0) |
3169 | clear_bit(In_sync, &rdev->flags); | 3434 | clear_bit(In_sync, &rdev->flags); |
3170 | else { | 3435 | else { |
3171 | char nm[20]; | 3436 | if (sysfs_link_rdev(mddev, rdev)) |
3172 | sprintf(nm, "rd%d", rdev->raid_disk); | 3437 | printk(KERN_WARNING "md: cannot register rd%d" |
3173 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | 3438 | " for %s after level change\n", |
3174 | printk("md: cannot register %s for %s after level change\n", | 3439 | rdev->raid_disk, mdname(mddev)); |
3175 | nm, mdname(mddev)); | ||
3176 | } | 3440 | } |
3177 | } | 3441 | } |
3178 | 3442 | ||
@@ -4504,7 +4768,8 @@ int md_run(mddev_t *mddev) | |||
4504 | } | 4768 | } |
4505 | 4769 | ||
4506 | if (mddev->bio_set == NULL) | 4770 | if (mddev->bio_set == NULL) |
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | 4771 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, |
4772 | sizeof(mddev_t *)); | ||
4508 | 4773 | ||
4509 | spin_lock(&pers_lock); | 4774 | spin_lock(&pers_lock); |
4510 | pers = find_pers(mddev->level, mddev->clevel); | 4775 | pers = find_pers(mddev->level, mddev->clevel); |
@@ -4621,12 +4886,9 @@ int md_run(mddev_t *mddev) | |||
4621 | smp_wmb(); | 4886 | smp_wmb(); |
4622 | mddev->ready = 1; | 4887 | mddev->ready = 1; |
4623 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4888 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4624 | if (rdev->raid_disk >= 0) { | 4889 | if (rdev->raid_disk >= 0) |
4625 | char nm[20]; | 4890 | if (sysfs_link_rdev(mddev, rdev)) |
4626 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4627 | if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
4628 | /* failure here is OK */; | 4891 | /* failure here is OK */; |
4629 | } | ||
4630 | 4892 | ||
4631 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4893 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4632 | 4894 | ||
@@ -4854,11 +5116,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4854 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5116 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4855 | 5117 | ||
4856 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5118 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4857 | if (rdev->raid_disk >= 0) { | 5119 | if (rdev->raid_disk >= 0) |
4858 | char nm[20]; | 5120 | sysfs_unlink_rdev(mddev, rdev); |
4859 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4860 | sysfs_remove_link(&mddev->kobj, nm); | ||
4861 | } | ||
4862 | 5121 | ||
4863 | set_capacity(disk, 0); | 5122 | set_capacity(disk, 0); |
4864 | mutex_unlock(&mddev->open_mutex); | 5123 | mutex_unlock(&mddev->open_mutex); |
@@ -5750,6 +6009,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) | |||
5750 | return -ENODEV; | 6009 | return -ENODEV; |
5751 | 6010 | ||
5752 | md_error(mddev, rdev); | 6011 | md_error(mddev, rdev); |
6012 | if (!test_bit(Faulty, &rdev->flags)) | ||
6013 | return -EBUSY; | ||
5753 | return 0; | 6014 | return 0; |
5754 | } | 6015 | } |
5755 | 6016 | ||
@@ -6178,11 +6439,18 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
6178 | return thread; | 6439 | return thread; |
6179 | } | 6440 | } |
6180 | 6441 | ||
6181 | void md_unregister_thread(mdk_thread_t *thread) | 6442 | void md_unregister_thread(mdk_thread_t **threadp) |
6182 | { | 6443 | { |
6444 | mdk_thread_t *thread = *threadp; | ||
6183 | if (!thread) | 6445 | if (!thread) |
6184 | return; | 6446 | return; |
6185 | dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); | 6447 | dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); |
6448 | /* Locking ensures that mddev_unlock does not wake_up a | ||
6449 | * non-existent thread | ||
6450 | */ | ||
6451 | spin_lock(&pers_lock); | ||
6452 | *threadp = NULL; | ||
6453 | spin_unlock(&pers_lock); | ||
6186 | 6454 | ||
6187 | kthread_stop(thread->tsk); | 6455 | kthread_stop(thread->tsk); |
6188 | kfree(thread); | 6456 | kfree(thread); |
@@ -6198,18 +6466,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6198 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 6466 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
6199 | return; | 6467 | return; |
6200 | 6468 | ||
6201 | if (mddev->external) | 6469 | if (!mddev->pers || !mddev->pers->error_handler) |
6202 | set_bit(Blocked, &rdev->flags); | ||
6203 | /* | ||
6204 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
6205 | mdname(mddev), | ||
6206 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
6207 | __builtin_return_address(0),__builtin_return_address(1), | ||
6208 | __builtin_return_address(2),__builtin_return_address(3)); | ||
6209 | */ | ||
6210 | if (!mddev->pers) | ||
6211 | return; | ||
6212 | if (!mddev->pers->error_handler) | ||
6213 | return; | 6470 | return; |
6214 | mddev->pers->error_handler(mddev,rdev); | 6471 | mddev->pers->error_handler(mddev,rdev); |
6215 | if (mddev->degraded) | 6472 | if (mddev->degraded) |
@@ -6394,16 +6651,11 @@ static void md_seq_stop(struct seq_file *seq, void *v) | |||
6394 | mddev_put(mddev); | 6651 | mddev_put(mddev); |
6395 | } | 6652 | } |
6396 | 6653 | ||
6397 | struct mdstat_info { | ||
6398 | int event; | ||
6399 | }; | ||
6400 | |||
6401 | static int md_seq_show(struct seq_file *seq, void *v) | 6654 | static int md_seq_show(struct seq_file *seq, void *v) |
6402 | { | 6655 | { |
6403 | mddev_t *mddev = v; | 6656 | mddev_t *mddev = v; |
6404 | sector_t sectors; | 6657 | sector_t sectors; |
6405 | mdk_rdev_t *rdev; | 6658 | mdk_rdev_t *rdev; |
6406 | struct mdstat_info *mi = seq->private; | ||
6407 | struct bitmap *bitmap; | 6659 | struct bitmap *bitmap; |
6408 | 6660 | ||
6409 | if (v == (void*)1) { | 6661 | if (v == (void*)1) { |
@@ -6415,7 +6667,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6415 | 6667 | ||
6416 | spin_unlock(&pers_lock); | 6668 | spin_unlock(&pers_lock); |
6417 | seq_printf(seq, "\n"); | 6669 | seq_printf(seq, "\n"); |
6418 | mi->event = atomic_read(&md_event_count); | 6670 | seq->poll_event = atomic_read(&md_event_count); |
6419 | return 0; | 6671 | return 0; |
6420 | } | 6672 | } |
6421 | if (v == (void*)2) { | 6673 | if (v == (void*)2) { |
@@ -6527,26 +6779,21 @@ static const struct seq_operations md_seq_ops = { | |||
6527 | 6779 | ||
6528 | static int md_seq_open(struct inode *inode, struct file *file) | 6780 | static int md_seq_open(struct inode *inode, struct file *file) |
6529 | { | 6781 | { |
6782 | struct seq_file *seq; | ||
6530 | int error; | 6783 | int error; |
6531 | struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); | ||
6532 | if (mi == NULL) | ||
6533 | return -ENOMEM; | ||
6534 | 6784 | ||
6535 | error = seq_open(file, &md_seq_ops); | 6785 | error = seq_open(file, &md_seq_ops); |
6536 | if (error) | 6786 | if (error) |
6537 | kfree(mi); | 6787 | return error; |
6538 | else { | 6788 | |
6539 | struct seq_file *p = file->private_data; | 6789 | seq = file->private_data; |
6540 | p->private = mi; | 6790 | seq->poll_event = atomic_read(&md_event_count); |
6541 | mi->event = atomic_read(&md_event_count); | ||
6542 | } | ||
6543 | return error; | 6791 | return error; |
6544 | } | 6792 | } |
6545 | 6793 | ||
6546 | static unsigned int mdstat_poll(struct file *filp, poll_table *wait) | 6794 | static unsigned int mdstat_poll(struct file *filp, poll_table *wait) |
6547 | { | 6795 | { |
6548 | struct seq_file *m = filp->private_data; | 6796 | struct seq_file *seq = filp->private_data; |
6549 | struct mdstat_info *mi = m->private; | ||
6550 | int mask; | 6797 | int mask; |
6551 | 6798 | ||
6552 | poll_wait(filp, &md_event_waiters, wait); | 6799 | poll_wait(filp, &md_event_waiters, wait); |
@@ -6554,7 +6801,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait) | |||
6554 | /* always allow read */ | 6801 | /* always allow read */ |
6555 | mask = POLLIN | POLLRDNORM; | 6802 | mask = POLLIN | POLLRDNORM; |
6556 | 6803 | ||
6557 | if (mi->event != atomic_read(&md_event_count)) | 6804 | if (seq->poll_event != atomic_read(&md_event_count)) |
6558 | mask |= POLLERR | POLLPRI; | 6805 | mask |= POLLERR | POLLPRI; |
6559 | return mask; | 6806 | return mask; |
6560 | } | 6807 | } |
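Editor's note: a user-space monitor can consume the poll_event mechanism above roughly as follows (a sketch only, error handling trimmed). poll(2) on /proc/mdstat raises POLLPRI once md_event_count has moved past the value snapshotted when the file was last read from the start.

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[4096];
        int fd = open("/proc/mdstat", O_RDONLY);

        if (fd < 0)
            return 1;
        for (;;) {
            struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

            if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
                /* re-read from the start: md_seq_show() refreshes
                 * seq->poll_event, which re-arms the notification */
                lseek(fd, 0, SEEK_SET);
                if (read(fd, buf, sizeof(buf)) > 0)
                    printf("md event\n");
            }
        }
        return 0;
    }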
@@ -6943,11 +7190,14 @@ void md_do_sync(mddev_t *mddev) | |||
6943 | atomic_add(sectors, &mddev->recovery_active); | 7190 | atomic_add(sectors, &mddev->recovery_active); |
6944 | } | 7191 | } |
6945 | 7192 | ||
7193 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
7194 | break; | ||
7195 | |||
6946 | j += sectors; | 7196 | j += sectors; |
6947 | if (j>1) mddev->curr_resync = j; | 7197 | if (j>1) mddev->curr_resync = j; |
6948 | mddev->curr_mark_cnt = io_sectors; | 7198 | mddev->curr_mark_cnt = io_sectors; |
6949 | if (last_check == 0) | 7199 | if (last_check == 0) |
6950 | /* this is the earliers that rebuilt will be | 7200 | /* this is the earliest that rebuild will be |
6951 | * visible in /proc/mdstat | 7201 | * visible in /proc/mdstat |
6952 | */ | 7202 | */ |
6953 | md_new_event(mddev); | 7203 | md_new_event(mddev); |
@@ -6956,10 +7206,6 @@ void md_do_sync(mddev_t *mddev) | |||
6956 | continue; | 7206 | continue; |
6957 | 7207 | ||
6958 | last_check = io_sectors; | 7208 | last_check = io_sectors; |
6959 | |||
6960 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6961 | break; | ||
6962 | |||
6963 | repeat: | 7209 | repeat: |
6964 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | 7210 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
6965 | /* step marks */ | 7211 | /* step marks */ |
@@ -7077,29 +7323,23 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7077 | atomic_read(&rdev->nr_pending)==0) { | 7323 | atomic_read(&rdev->nr_pending)==0) { |
7078 | if (mddev->pers->hot_remove_disk( | 7324 | if (mddev->pers->hot_remove_disk( |
7079 | mddev, rdev->raid_disk)==0) { | 7325 | mddev, rdev->raid_disk)==0) { |
7080 | char nm[20]; | 7326 | sysfs_unlink_rdev(mddev, rdev); |
7081 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7082 | sysfs_remove_link(&mddev->kobj, nm); | ||
7083 | rdev->raid_disk = -1; | 7327 | rdev->raid_disk = -1; |
7084 | } | 7328 | } |
7085 | } | 7329 | } |
7086 | 7330 | ||
7087 | if (mddev->degraded && !mddev->recovery_disabled) { | 7331 | if (mddev->degraded) { |
7088 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7332 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7089 | if (rdev->raid_disk >= 0 && | 7333 | if (rdev->raid_disk >= 0 && |
7090 | !test_bit(In_sync, &rdev->flags) && | 7334 | !test_bit(In_sync, &rdev->flags) && |
7091 | !test_bit(Faulty, &rdev->flags) && | 7335 | !test_bit(Faulty, &rdev->flags)) |
7092 | !test_bit(Blocked, &rdev->flags)) | ||
7093 | spares++; | 7336 | spares++; |
7094 | if (rdev->raid_disk < 0 | 7337 | if (rdev->raid_disk < 0 |
7095 | && !test_bit(Faulty, &rdev->flags)) { | 7338 | && !test_bit(Faulty, &rdev->flags)) { |
7096 | rdev->recovery_offset = 0; | 7339 | rdev->recovery_offset = 0; |
7097 | if (mddev->pers-> | 7340 | if (mddev->pers-> |
7098 | hot_add_disk(mddev, rdev) == 0) { | 7341 | hot_add_disk(mddev, rdev) == 0) { |
7099 | char nm[20]; | 7342 | if (sysfs_link_rdev(mddev, rdev)) |
7100 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
7101 | if (sysfs_create_link(&mddev->kobj, | ||
7102 | &rdev->kobj, nm)) | ||
7103 | /* failure here is OK */; | 7343 | /* failure here is OK */; |
7104 | spares++; | 7344 | spares++; |
7105 | md_new_event(mddev); | 7345 | md_new_event(mddev); |
@@ -7117,8 +7357,7 @@ static void reap_sync_thread(mddev_t *mddev) | |||
7117 | mdk_rdev_t *rdev; | 7357 | mdk_rdev_t *rdev; |
7118 | 7358 | ||
7119 | /* resync has finished, collect result */ | 7359 | /* resync has finished, collect result */ |
7120 | md_unregister_thread(mddev->sync_thread); | 7360 | md_unregister_thread(&mddev->sync_thread); |
7121 | mddev->sync_thread = NULL; | ||
7122 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | 7361 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
7123 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 7362 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
7124 | /* success...*/ | 7363 | /* success...*/ |
@@ -7148,6 +7387,8 @@ static void reap_sync_thread(mddev_t *mddev) | |||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7387 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7388 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
7150 | md_new_event(mddev); | 7389 | md_new_event(mddev); |
7390 | if (mddev->event_work.func) | ||
7391 | queue_work(md_misc_wq, &mddev->event_work); | ||
7151 | } | 7392 | } |
7152 | 7393 | ||
7153 | /* | 7394 | /* |
@@ -7180,9 +7421,6 @@ void md_check_recovery(mddev_t *mddev) | |||
7180 | if (mddev->bitmap) | 7421 | if (mddev->bitmap) |
7181 | bitmap_daemon_work(mddev); | 7422 | bitmap_daemon_work(mddev); |
7182 | 7423 | ||
7183 | if (mddev->ro) | ||
7184 | return; | ||
7185 | |||
7186 | if (signal_pending(current)) { | 7424 | if (signal_pending(current)) { |
7187 | if (mddev->pers->sync_request && !mddev->external) { | 7425 | if (mddev->pers->sync_request && !mddev->external) { |
7188 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 7426 | printk(KERN_INFO "md: %s in immediate safe mode\n", |
@@ -7219,9 +7457,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7219 | atomic_read(&rdev->nr_pending)==0) { | 7457 | atomic_read(&rdev->nr_pending)==0) { |
7220 | if (mddev->pers->hot_remove_disk( | 7458 | if (mddev->pers->hot_remove_disk( |
7221 | mddev, rdev->raid_disk)==0) { | 7459 | mddev, rdev->raid_disk)==0) { |
7222 | char nm[20]; | 7460 | sysfs_unlink_rdev(mddev, rdev); |
7223 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7224 | sysfs_remove_link(&mddev->kobj, nm); | ||
7225 | rdev->raid_disk = -1; | 7461 | rdev->raid_disk = -1; |
7226 | } | 7462 | } |
7227 | } | 7463 | } |
@@ -7341,12 +7577,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
7341 | { | 7577 | { |
7342 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 7578 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
7343 | wait_event_timeout(rdev->blocked_wait, | 7579 | wait_event_timeout(rdev->blocked_wait, |
7344 | !test_bit(Blocked, &rdev->flags), | 7580 | !test_bit(Blocked, &rdev->flags) && |
7581 | !test_bit(BlockedBadBlocks, &rdev->flags), | ||
7345 | msecs_to_jiffies(5000)); | 7582 | msecs_to_jiffies(5000)); |
7346 | rdev_dec_pending(rdev, mddev); | 7583 | rdev_dec_pending(rdev, mddev); |
7347 | } | 7584 | } |
7348 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7585 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
7349 | 7586 | ||
7587 | |||
7588 | /* Bad block management. | ||
7589 | * We can record which blocks on each device are 'bad' and so just | ||
7590 | * fail those blocks, or that stripe, rather than the whole device. | ||
7591 | * Entries in the bad-block table are 64bits wide. This comprises: | ||
7592 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 | ||
7593 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) | ||
7594 | * A 'shift' can be set so that larger blocks are tracked and | ||
7595 | * consequently larger devices can be covered. | ||
7596 | * 'Acknowledged' flag - 1 bit. - the most significant bit. | ||
7597 | * | ||
7598 | * Locking of the bad-block table uses a seqlock so md_is_badblock | ||
7599 | * might need to retry if it is very unlucky. | ||
7600 | * We will sometimes want to check for bad blocks in a bi_end_io function, | ||
7601 | * so we use the write_seqlock_irq variant. | ||
7602 | * | ||
7603 | * When looking for a bad block we specify a range and want to | ||
7604 | * know if any block in the range is bad. So we binary-search | ||
7605 | * to the last range that starts at-or-before the given endpoint, | ||
7606 | * (or "before the sector after the target range") | ||
7607 | * then see if it ends after the given start. | ||
7608 | * We return | ||
7609 | * 0 if there are no known bad blocks in the range | ||
7610 | * 1 if there are known bad block which are all acknowledged | ||
7611 | * -1 if there are bad blocks which have not yet been acknowledged in metadata. | ||
7612 | * plus the start/length of the first bad section we overlap. | ||
7613 | */ | ||
7614 | int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
7615 | sector_t *first_bad, int *bad_sectors) | ||
7616 | { | ||
7617 | int hi; | ||
7618 | int lo = 0; | ||
7619 | u64 *p = bb->page; | ||
7620 | int rv = 0; | ||
7621 | sector_t target = s + sectors; | ||
7622 | unsigned seq; | ||
7623 | |||
7624 | if (bb->shift > 0) { | ||
7625 | /* round the start down, and the end up */ | ||
7626 | s >>= bb->shift; | ||
7627 | target += (1<<bb->shift) - 1; | ||
7628 | target >>= bb->shift; | ||
7629 | sectors = target - s; | ||
7630 | } | ||
7631 | /* 'target' is now the first block after the bad range */ | ||
7632 | |||
7633 | retry: | ||
7634 | seq = read_seqbegin(&bb->lock); | ||
7635 | |||
7636 | hi = bb->count; | ||
7637 | |||
7638 | /* Binary search between lo and hi for 'target' | ||
7639 | * i.e. for the last range that starts before 'target' | ||
7640 | */ | ||
7641 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' | ||
7642 | * are known not to be the last range before target. | ||
7643 | * VARIANT: hi-lo is the number of possible | ||
7644 | * ranges, and decreases until it reaches 1 | ||
7645 | */ | ||
7646 | while (hi - lo > 1) { | ||
7647 | int mid = (lo + hi) / 2; | ||
7648 | sector_t a = BB_OFFSET(p[mid]); | ||
7649 | if (a < target) | ||
7650 | /* This could still be the one, earlier ranges | ||
7651 | * could not. */ | ||
7652 | lo = mid; | ||
7653 | else | ||
7654 | /* This and later ranges are definitely out. */ | ||
7655 | hi = mid; | ||
7656 | } | ||
7657 | /* 'lo' might be the last that started before target, but 'hi' isn't */ | ||
7658 | if (hi > lo) { | ||
7659 | /* need to check all ranges that end after 's' to see if | ||
7660 | * any are unacknowledged. | ||
7661 | */ | ||
7662 | while (lo >= 0 && | ||
7663 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7664 | if (BB_OFFSET(p[lo]) < target) { | ||
7665 | /* starts before the end, and finishes after | ||
7666 | * the start, so they must overlap | ||
7667 | */ | ||
7668 | if (rv != -1 && BB_ACK(p[lo])) | ||
7669 | rv = 1; | ||
7670 | else | ||
7671 | rv = -1; | ||
7672 | *first_bad = BB_OFFSET(p[lo]); | ||
7673 | *bad_sectors = BB_LEN(p[lo]); | ||
7674 | } | ||
7675 | lo--; | ||
7676 | } | ||
7677 | } | ||
7678 | |||
7679 | if (read_seqretry(&bb->lock, seq)) | ||
7680 | goto retry; | ||
7681 | |||
7682 | return rv; | ||
7683 | } | ||
7684 | EXPORT_SYMBOL_GPL(md_is_badblock); | ||
7685 | |||
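Editor's note: a hedged sketch of the intended calling convention (the surrounding variables are illustrative and the fragment is not a complete function). A personality checking a read request against the table can either clip the request or move on to another device; is_badblock() in md.h wraps md_is_badblock() and converts the result back from device offsets to array offsets.

    sector_t first_bad;
    int bad_sectors;

    if (is_badblock(rdev, this_sector, sectors, &first_bad, &bad_sectors)) {
        if (first_bad <= this_sector) {
            /* the start of the request is already known bad:
             * this device cannot serve it, try another one */
        } else {
            /* read only up to the first bad sector */
            sectors = first_bad - this_sector;
        }
    }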
7686 | /* | ||
7687 | * Add a range of bad blocks to the table. | ||
7688 | * This might extend the table, or might contract it | ||
7689 | * if two adjacent ranges can be merged. | ||
7690 | * We binary-search to find the 'insertion' point, then | ||
7691 | * decide how best to handle it. | ||
7692 | */ | ||
7693 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
7694 | int acknowledged) | ||
7695 | { | ||
7696 | u64 *p; | ||
7697 | int lo, hi; | ||
7698 | int rv = 1; | ||
7699 | |||
7700 | if (bb->shift < 0) | ||
7701 | /* badblocks are disabled */ | ||
7702 | return 0; | ||
7703 | |||
7704 | if (bb->shift) { | ||
7705 | /* round the start down, and the end up */ | ||
7706 | sector_t next = s + sectors; | ||
7707 | s >>= bb->shift; | ||
7708 | next += (1<<bb->shift) - 1; | ||
7709 | next >>= bb->shift; | ||
7710 | sectors = next - s; | ||
7711 | } | ||
7712 | |||
7713 | write_seqlock_irq(&bb->lock); | ||
7714 | |||
7715 | p = bb->page; | ||
7716 | lo = 0; | ||
7717 | hi = bb->count; | ||
7718 | /* Find the last range that starts at-or-before 's' */ | ||
7719 | while (hi - lo > 1) { | ||
7720 | int mid = (lo + hi) / 2; | ||
7721 | sector_t a = BB_OFFSET(p[mid]); | ||
7722 | if (a <= s) | ||
7723 | lo = mid; | ||
7724 | else | ||
7725 | hi = mid; | ||
7726 | } | ||
7727 | if (hi > lo && BB_OFFSET(p[lo]) > s) | ||
7728 | hi = lo; | ||
7729 | |||
7730 | if (hi > lo) { | ||
7731 | /* we found a range that might merge with the start | ||
7732 | * of our new range | ||
7733 | */ | ||
7734 | sector_t a = BB_OFFSET(p[lo]); | ||
7735 | sector_t e = a + BB_LEN(p[lo]); | ||
7736 | int ack = BB_ACK(p[lo]); | ||
7737 | if (e >= s) { | ||
7738 | /* Yes, we can merge with a previous range */ | ||
7739 | if (s == a && s + sectors >= e) | ||
7740 | /* new range covers old */ | ||
7741 | ack = acknowledged; | ||
7742 | else | ||
7743 | ack = ack && acknowledged; | ||
7744 | |||
7745 | if (e < s + sectors) | ||
7746 | e = s + sectors; | ||
7747 | if (e - a <= BB_MAX_LEN) { | ||
7748 | p[lo] = BB_MAKE(a, e-a, ack); | ||
7749 | s = e; | ||
7750 | } else { | ||
7751 | /* does not all fit in one range, | ||
7752 | * make p[lo] maximal | ||
7753 | */ | ||
7754 | if (BB_LEN(p[lo]) != BB_MAX_LEN) | ||
7755 | p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7756 | s = a + BB_MAX_LEN; | ||
7757 | } | ||
7758 | sectors = e - s; | ||
7759 | } | ||
7760 | } | ||
7761 | if (sectors && hi < bb->count) { | ||
7762 | /* 'hi' points to the first range that starts after 's'. | ||
7763 | * Maybe we can merge with the start of that range */ | ||
7764 | sector_t a = BB_OFFSET(p[hi]); | ||
7765 | sector_t e = a + BB_LEN(p[hi]); | ||
7766 | int ack = BB_ACK(p[hi]); | ||
7767 | if (a <= s + sectors) { | ||
7768 | /* merging is possible */ | ||
7769 | if (e <= s + sectors) { | ||
7770 | /* full overlap */ | ||
7771 | e = s + sectors; | ||
7772 | ack = acknowledged; | ||
7773 | } else | ||
7774 | ack = ack && acknowledged; | ||
7775 | |||
7776 | a = s; | ||
7777 | if (e - a <= BB_MAX_LEN) { | ||
7778 | p[hi] = BB_MAKE(a, e-a, ack); | ||
7779 | s = e; | ||
7780 | } else { | ||
7781 | p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7782 | s = a + BB_MAX_LEN; | ||
7783 | } | ||
7784 | sectors = e - s; | ||
7785 | lo = hi; | ||
7786 | hi++; | ||
7787 | } | ||
7788 | } | ||
7789 | if (sectors == 0 && hi < bb->count) { | ||
7790 | /* we might be able to combine lo and hi */ | ||
7791 | /* Note: 's' is at the end of 'lo' */ | ||
7792 | sector_t a = BB_OFFSET(p[hi]); | ||
7793 | int lolen = BB_LEN(p[lo]); | ||
7794 | int hilen = BB_LEN(p[hi]); | ||
7795 | int newlen = lolen + hilen - (s - a); | ||
7796 | if (s >= a && newlen < BB_MAX_LEN) { | ||
7797 | /* yes, we can combine them */ | ||
7798 | int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); | ||
7799 | p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); | ||
7800 | memmove(p + hi, p + hi + 1, | ||
7801 | (bb->count - hi - 1) * 8); | ||
7802 | bb->count--; | ||
7803 | } | ||
7804 | } | ||
7805 | while (sectors) { | ||
7806 | /* didn't merge (it all). | ||
7807 | * Need to add a range just before 'hi' */ | ||
7808 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7809 | /* No room for more */ | ||
7810 | rv = 0; | ||
7811 | break; | ||
7812 | } else { | ||
7813 | int this_sectors = sectors; | ||
7814 | memmove(p + hi + 1, p + hi, | ||
7815 | (bb->count - hi) * 8); | ||
7816 | bb->count++; | ||
7817 | |||
7818 | if (this_sectors > BB_MAX_LEN) | ||
7819 | this_sectors = BB_MAX_LEN; | ||
7820 | p[hi] = BB_MAKE(s, this_sectors, acknowledged); | ||
7821 | sectors -= this_sectors; | ||
7822 | s += this_sectors; | ||
7823 | } | ||
7824 | } | ||
7825 | |||
7826 | bb->changed = 1; | ||
7827 | if (!acknowledged) | ||
7828 | bb->unacked_exist = 1; | ||
7829 | write_sequnlock_irq(&bb->lock); | ||
7830 | |||
7831 | return rv; | ||
7832 | } | ||
7833 | |||
7834 | int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
7835 | int acknowledged) | ||
7836 | { | ||
7837 | int rv = md_set_badblocks(&rdev->badblocks, | ||
7838 | s + rdev->data_offset, sectors, acknowledged); | ||
7839 | if (rv) { | ||
7840 | /* Make sure they get written out promptly */ | ||
7841 | set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); | ||
7842 | md_wakeup_thread(rdev->mddev->thread); | ||
7843 | } | ||
7844 | return rv; | ||
7845 | } | ||
7846 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); | ||
7847 | |||
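Editor's note: one plausible caller pattern, sketched for illustration (variable names are assumptions). When a write to a member device fails, the personality first tries to remember the range; a return of 0 means the table is full and the range cannot be recorded, so the whole device has to be failed instead. The set_bit()/md_wakeup_thread() in rdev_set_badblocks() above already arranges for a successful addition to reach the on-disk metadata promptly.

    /* illustrative fragment only */
    if (!rdev_set_badblocks(rdev, failed_sector, failed_sectors, 0))
        /* no room left in the bad-block table */
        md_error(mddev, rdev);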
7848 | /* | ||
7849 | * Remove a range of bad blocks from the table. | ||
7850 | * This may involve extending the table if we split a region, | ||
7851 | * but it must not fail. So if the table becomes full, we just | ||
7852 | * drop the remove request. | ||
7853 | */ | ||
7854 | static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) | ||
7855 | { | ||
7856 | u64 *p; | ||
7857 | int lo, hi; | ||
7858 | sector_t target = s + sectors; | ||
7859 | int rv = 0; | ||
7860 | |||
7861 | if (bb->shift > 0) { | ||
7862 | /* When clearing we round the start up and the end down. | ||
7863 | * This should not matter as the shift should align with | ||
7864 | * the block size and no rounding should ever be needed. | ||
7865 | * However it is better to think a block is bad when it | ||
7866 | * isn't than to think a block is not bad when it is. | ||
7867 | */ | ||
7868 | s += (1<<bb->shift) - 1; | ||
7869 | s >>= bb->shift; | ||
7870 | target >>= bb->shift; | ||
7871 | sectors = target - s; | ||
7872 | } | ||
7873 | |||
7874 | write_seqlock_irq(&bb->lock); | ||
7875 | |||
7876 | p = bb->page; | ||
7877 | lo = 0; | ||
7878 | hi = bb->count; | ||
7879 | /* Find the last range that starts before 'target' */ | ||
7880 | while (hi - lo > 1) { | ||
7881 | int mid = (lo + hi) / 2; | ||
7882 | sector_t a = BB_OFFSET(p[mid]); | ||
7883 | if (a < target) | ||
7884 | lo = mid; | ||
7885 | else | ||
7886 | hi = mid; | ||
7887 | } | ||
7888 | if (hi > lo) { | ||
7889 | /* p[lo] is the last range that could overlap the | ||
7890 | * current range. Earlier ranges could also overlap, | ||
7891 | * but only this one can overlap the end of the range. | ||
7892 | */ | ||
7893 | if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { | ||
7894 | /* Partial overlap, leave the tail of this range */ | ||
7895 | int ack = BB_ACK(p[lo]); | ||
7896 | sector_t a = BB_OFFSET(p[lo]); | ||
7897 | sector_t end = a + BB_LEN(p[lo]); | ||
7898 | |||
7899 | if (a < s) { | ||
7900 | /* we need to split this range */ | ||
7901 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7902 | rv = 0; | ||
7903 | goto out; | ||
7904 | } | ||
7905 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); | ||
7906 | bb->count++; | ||
7907 | p[lo] = BB_MAKE(a, s-a, ack); | ||
7908 | lo++; | ||
7909 | } | ||
7910 | p[lo] = BB_MAKE(target, end - target, ack); | ||
7911 | /* there is no longer an overlap */ | ||
7912 | hi = lo; | ||
7913 | lo--; | ||
7914 | } | ||
7915 | while (lo >= 0 && | ||
7916 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7917 | /* This range does overlap */ | ||
7918 | if (BB_OFFSET(p[lo]) < s) { | ||
7919 | /* Keep the early parts of this range. */ | ||
7920 | int ack = BB_ACK(p[lo]); | ||
7921 | sector_t start = BB_OFFSET(p[lo]); | ||
7922 | p[lo] = BB_MAKE(start, s - start, ack); | ||
7923 | /* now low doesn't overlap, so.. */ | ||
7924 | break; | ||
7925 | } | ||
7926 | lo--; | ||
7927 | } | ||
7928 | /* 'lo' is strictly before, 'hi' is strictly after, | ||
7929 | * anything between needs to be discarded | ||
7930 | */ | ||
7931 | if (hi - lo > 1) { | ||
7932 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); | ||
7933 | bb->count -= (hi - lo - 1); | ||
7934 | } | ||
7935 | } | ||
7936 | |||
7937 | bb->changed = 1; | ||
7938 | out: | ||
7939 | write_sequnlock_irq(&bb->lock); | ||
7940 | return rv; | ||
7941 | } | ||
7942 | |||
7943 | int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) | ||
7944 | { | ||
7945 | return md_clear_badblocks(&rdev->badblocks, | ||
7946 | s + rdev->data_offset, | ||
7947 | sectors); | ||
7948 | } | ||
7949 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | ||
7950 | |||
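Editor's note: the matching clear path, again only as a sketch with assumed names. Once a recorded range has been successfully rewritten with good data (for example during resync or recovery), the entry can be dropped so normal reads are allowed through again. If the table happens to be full when a split would be needed, the clear is silently skipped, which the comment above argues is the safe direction.

    /* illustrative fragment only */
    if (rewrite_succeeded)
        rdev_clear_badblocks(rdev, failed_sector, failed_sectors);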
7951 | /* | ||
7952 | * Acknowledge all bad blocks in a list. | ||
7953 | * This only succeeds if ->changed is clear. It is used by | ||
7954 | * in-kernel metadata updates | ||
7955 | */ | ||
7956 | void md_ack_all_badblocks(struct badblocks *bb) | ||
7957 | { | ||
7958 | if (bb->page == NULL || bb->changed) | ||
7959 | /* no point even trying */ | ||
7960 | return; | ||
7961 | write_seqlock_irq(&bb->lock); | ||
7962 | |||
7963 | if (bb->changed == 0) { | ||
7964 | u64 *p = bb->page; | ||
7965 | int i; | ||
7966 | for (i = 0; i < bb->count ; i++) { | ||
7967 | if (!BB_ACK(p[i])) { | ||
7968 | sector_t start = BB_OFFSET(p[i]); | ||
7969 | int len = BB_LEN(p[i]); | ||
7970 | p[i] = BB_MAKE(start, len, 1); | ||
7971 | } | ||
7972 | } | ||
7973 | bb->unacked_exist = 0; | ||
7974 | } | ||
7975 | write_sequnlock_irq(&bb->lock); | ||
7976 | } | ||
7977 | EXPORT_SYMBOL_GPL(md_ack_all_badblocks); | ||
7978 | |||
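Editor's note: a short recap of the ordering this relies on, written as a comment sketch. It is a simplification based only on the hunks visible above; where exactly ->changed is cleared lives in the metadata sync code outside this section and is assumed here.

    /*
     * md_update_sb(), shown earlier in this patch:
     *   - writes rdev->bb_page next to the superblock when
     *     badblocks.size is non-zero;
     *   - after all devices are written, calls md_ack_all_badblocks()
     *     for each rdev whose table changed, then clears
     *     BlockedBadBlocks and wakes rdev->blocked_wait.
     * The ->changed check above refuses to acknowledge entries that
     * were added after the table was copied out for writing.
     */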
7979 | /* sysfs access to bad-blocks list. | ||
7980 | * We present two files. | ||
7981 | * 'bad-blocks' lists sector numbers and lengths of ranges that | ||
7982 | * are recorded as bad. The list is truncated to fit within | ||
7983 | * the one-page limit of sysfs. | ||
7984 | * Writing "sector length" to this file adds an acknowledged | ||
7985 | * bad block to the list. | ||
7986 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet | ||
7987 | * been acknowledged. Writing to this file adds bad blocks | ||
7988 | * without acknowledging them. This is largely for testing. | ||
7989 | */ | ||
7990 | |||
7991 | static ssize_t | ||
7992 | badblocks_show(struct badblocks *bb, char *page, int unack) | ||
7993 | { | ||
7994 | size_t len; | ||
7995 | int i; | ||
7996 | u64 *p = bb->page; | ||
7997 | unsigned seq; | ||
7998 | |||
7999 | if (bb->shift < 0) | ||
8000 | return 0; | ||
8001 | |||
8002 | retry: | ||
8003 | seq = read_seqbegin(&bb->lock); | ||
8004 | |||
8005 | len = 0; | ||
8006 | i = 0; | ||
8007 | |||
8008 | while (len < PAGE_SIZE && i < bb->count) { | ||
8009 | sector_t s = BB_OFFSET(p[i]); | ||
8010 | unsigned int length = BB_LEN(p[i]); | ||
8011 | int ack = BB_ACK(p[i]); | ||
8012 | i++; | ||
8013 | |||
8014 | if (unack && ack) | ||
8015 | continue; | ||
8016 | |||
8017 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", | ||
8018 | (unsigned long long)s << bb->shift, | ||
8019 | length << bb->shift); | ||
8020 | } | ||
8021 | if (unack && len == 0) | ||
8022 | bb->unacked_exist = 0; | ||
8023 | |||
8024 | if (read_seqretry(&bb->lock, seq)) | ||
8025 | goto retry; | ||
8026 | |||
8027 | return len; | ||
8028 | } | ||
8029 | |||
8030 | #define DO_DEBUG 1 | ||
8031 | |||
8032 | static ssize_t | ||
8033 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) | ||
8034 | { | ||
8035 | unsigned long long sector; | ||
8036 | int length; | ||
8037 | char newline; | ||
8038 | #ifdef DO_DEBUG | ||
8039 | /* Allow clearing via sysfs *only* for testing/debugging. | ||
8040 | * Normally only a successful write may clear a badblock | ||
8041 | */ | ||
8042 | int clear = 0; | ||
8043 | if (page[0] == '-') { | ||
8044 | clear = 1; | ||
8045 | page++; | ||
8046 | } | ||
8047 | #endif /* DO_DEBUG */ | ||
8048 | |||
8049 | switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { | ||
8050 | case 3: | ||
8051 | if (newline != '\n') | ||
8052 | return -EINVAL; | ||
8053 | case 2: | ||
8054 | if (length <= 0) | ||
8055 | return -EINVAL; | ||
8056 | break; | ||
8057 | default: | ||
8058 | return -EINVAL; | ||
8059 | } | ||
8060 | |||
8061 | #ifdef DO_DEBUG | ||
8062 | if (clear) { | ||
8063 | md_clear_badblocks(bb, sector, length); | ||
8064 | return len; | ||
8065 | } | ||
8066 | #endif /* DO_DEBUG */ | ||
8067 | if (md_set_badblocks(bb, sector, length, !unack)) | ||
8068 | return len; | ||
8069 | else | ||
8070 | return -ENOSPC; | ||
8071 | } | ||
8072 | |||
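Editor's note: a small user-space sketch of feeding this parser. The sysfs path is an assumption that depends on the array and device names; the attribute is the per-rdev bad_blocks file registered earlier.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/sys/block/md0/md/dev-sda1/bad_blocks"; /* assumed */
        const char *entry = "123456 16\n";      /* "sector length" */
        char buf[4096];
        ssize_t n;
        int fd = open(path, O_RDWR);

        if (fd < 0)
            return 1;
        /* parsed by the sscanf() above and stored as an acknowledged range */
        if (write(fd, entry, strlen(entry)) < 0)
            perror("write");
        lseek(fd, 0, SEEK_SET);
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
            buf[n] = '\0';
            fputs(buf, stdout);                 /* one "sector length" per line */
        }
        close(fd);
        return 0;
    }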
7350 | static int md_notify_reboot(struct notifier_block *this, | 8073 | static int md_notify_reboot(struct notifier_block *this, |
7351 | unsigned long code, void *x) | 8074 | unsigned long code, void *x) |
7352 | { | 8075 | { |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c26c7a08ae..0a309dc29b4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -29,6 +29,13 @@ | |||
29 | typedef struct mddev_s mddev_t; | 29 | typedef struct mddev_s mddev_t; |
30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
31 | 31 | ||
32 | /* Bad block numbers are stored sorted in a single page. | ||
33 | * 64bits is used for each block or extent. | ||
34 | * 54 bits are sector number, 9 bits are extent size, | ||
35 | * 1 bit is an 'acknowledged' flag. | ||
36 | */ | ||
37 | #define MD_MAX_BADBLOCKS (PAGE_SIZE/8) | ||
38 | |||
32 | /* | 39 | /* |
33 | * MD's 'extended' device | 40 | * MD's 'extended' device |
34 | */ | 41 | */ |
@@ -48,7 +55,7 @@ struct mdk_rdev_s | |||
48 | struct block_device *meta_bdev; | 55 | struct block_device *meta_bdev; |
49 | struct block_device *bdev; /* block device handle */ | 56 | struct block_device *bdev; /* block device handle */ |
50 | 57 | ||
51 | struct page *sb_page; | 58 | struct page *sb_page, *bb_page; |
52 | int sb_loaded; | 59 | int sb_loaded; |
53 | __u64 sb_events; | 60 | __u64 sb_events; |
54 | sector_t data_offset; /* start of data in array */ | 61 | sector_t data_offset; /* start of data in array */ |
@@ -74,9 +81,29 @@ struct mdk_rdev_s | |||
74 | #define In_sync 2 /* device is in_sync with rest of array */ | 81 | #define In_sync 2 /* device is in_sync with rest of array */ |
75 | #define WriteMostly 4 /* Avoid reading if at all possible */ | 82 | #define WriteMostly 4 /* Avoid reading if at all possible */ |
76 | #define AutoDetected 7 /* added by auto-detect */ | 83 | #define AutoDetected 7 /* added by auto-detect */ |
77 | #define Blocked 8 /* An error occurred on an externally | 84 | #define Blocked 8 /* An error occurred but has not yet |
78 | * managed array, don't allow writes | 85 | * been acknowledged by the metadata |
86 | * handler, so don't allow writes | ||
79 | * until it is cleared */ | 87 | * until it is cleared */ |
88 | #define WriteErrorSeen 9 /* A write error has been seen on this | ||
89 | * device | ||
90 | */ | ||
91 | #define FaultRecorded 10 /* Intermediate state for clearing | ||
92 | * Blocked. The Fault is/will-be | ||
93 | * recorded in the metadata, but that | ||
94 | * metadata hasn't been stored safely | ||
95 | * on disk yet. | ||
96 | */ | ||
97 | #define BlockedBadBlocks 11 /* A writer is blocked because they | ||
98 | * found an unacknowledged bad-block. | ||
99 | * This can safely be cleared at any | ||
100 | * time, and the writer will re-check. | ||
101 | * It may be set at any time, and at | ||
102 | * worst the writer will timeout and | ||
103 | * re-check. So setting it as | ||
104 | * accurately as possible is good, but | ||
105 | * not absolutely critical. | ||
106 | */ | ||
80 | wait_queue_head_t blocked_wait; | 107 | wait_queue_head_t blocked_wait; |
81 | 108 | ||
82 | int desc_nr; /* descriptor index in the superblock */ | 109 | int desc_nr; /* descriptor index in the superblock */ |
@@ -111,8 +138,54 @@ struct mdk_rdev_s | |||
111 | 138 | ||
112 | struct sysfs_dirent *sysfs_state; /* handle for 'state' | 139 | struct sysfs_dirent *sysfs_state; /* handle for 'state' |
113 | * sysfs entry */ | 140 | * sysfs entry */ |
141 | |||
142 | struct badblocks { | ||
143 | int count; /* count of bad blocks */ | ||
144 | int unacked_exist; /* there probably are unacknowledged | ||
145 | * bad blocks. This is only cleared | ||
146 | * when a read discovers none | ||
147 | */ | ||
148 | int shift; /* shift from sectors to block size | ||
149 | * a -ve shift means badblocks are | ||
150 | * disabled.*/ | ||
151 | u64 *page; /* badblock list */ | ||
152 | int changed; | ||
153 | seqlock_t lock; | ||
154 | |||
155 | sector_t sector; | ||
156 | sector_t size; /* in sectors */ | ||
157 | } badblocks; | ||
114 | }; | 158 | }; |
115 | 159 | ||
160 | #define BB_LEN_MASK (0x00000000000001FFULL) | ||
161 | #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) | ||
162 | #define BB_ACK_MASK (0x8000000000000000ULL) | ||
163 | #define BB_MAX_LEN 512 | ||
164 | #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) | ||
165 | #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) | ||
166 | #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) | ||
167 | #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) | ||
168 | |||
169 | extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
170 | sector_t *first_bad, int *bad_sectors); | ||
171 | static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
172 | sector_t *first_bad, int *bad_sectors) | ||
173 | { | ||
174 | if (unlikely(rdev->badblocks.count)) { | ||
175 | int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, | ||
176 | sectors, | ||
177 | first_bad, bad_sectors); | ||
178 | if (rv) | ||
179 | *first_bad -= rdev->data_offset; | ||
180 | return rv; | ||
181 | } | ||
182 | return 0; | ||
183 | } | ||
184 | extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
185 | int acknowledged); | ||
186 | extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); | ||
187 | extern void md_ack_all_badblocks(struct badblocks *bb); | ||
188 | |||
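Editor's note: to make the packing concrete, a stand-alone user-space round-trip through the BB_* macros defined above (64 bits = 54-bit start sector, 9-bit length-minus-one, 1 acknowledged bit). The macros are copied verbatim; only the surrounding test harness is added for illustration.

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;

    #define BB_LEN_MASK     (0x00000000000001FFULL)
    #define BB_OFFSET_MASK  (0x7FFFFFFFFFFFFE00ULL)
    #define BB_ACK_MASK     (0x8000000000000000ULL)
    #define BB_MAX_LEN      512
    #define BB_OFFSET(x)    (((x) & BB_OFFSET_MASK) >> 9)
    #define BB_LEN(x)       (((x) & BB_LEN_MASK) + 1)
    #define BB_ACK(x)       (!!((x) & BB_ACK_MASK))
    #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

    int main(void)
    {
        /* an unacknowledged 512-sector bad range starting at sector 123456 */
        u64 e = BB_MAKE((u64)123456, 512, 0);

        printf("offset=%llu len=%u ack=%d\n",
               (unsigned long long)BB_OFFSET(e),
               (unsigned)BB_LEN(e), BB_ACK(e));
        /* prints: offset=123456 len=512 ack=0 */
        return 0;
    }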
116 | struct mddev_s | 189 | struct mddev_s |
117 | { | 190 | { |
118 | void *private; | 191 | void *private; |
@@ -239,9 +312,12 @@ struct mddev_s | |||
239 | #define MD_RECOVERY_FROZEN 9 | 312 | #define MD_RECOVERY_FROZEN 9 |
240 | 313 | ||
241 | unsigned long recovery; | 314 | unsigned long recovery; |
242 | int recovery_disabled; /* if we detect that recovery | 315 | /* If a RAID personality determines that recovery (of a particular |
243 | * will always fail, set this | 316 | * device) will fail due to a read error on the source device, it |
244 | * so we don't loop trying */ | 317 | * takes a copy of this number and does not attempt recovery again |
318 | * until this number changes. | ||
319 | */ | ||
320 | int recovery_disabled; | ||
245 | 321 | ||
246 | int in_sync; /* know to not need resync */ | 322 | int in_sync; /* know to not need resync */ |
247 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so | 323 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so |
@@ -304,11 +380,6 @@ struct mddev_s | |||
304 | * hot-adding a bitmap. It should | 380 | * hot-adding a bitmap. It should |
305 | * eventually be settable by sysfs. | 381 | * eventually be settable by sysfs. |
306 | */ | 382 | */ |
307 | /* When md is serving under dm, it might use a | ||
308 | * dirty_log to store the bits. | ||
309 | */ | ||
310 | struct dm_dirty_log *log; | ||
311 | |||
312 | struct mutex mutex; | 383 | struct mutex mutex; |
313 | unsigned long chunksize; | 384 | unsigned long chunksize; |
314 | unsigned long daemon_sleep; /* how many jiffies between updates? */ | 385 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev) | |||
413 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; | 484 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; |
414 | } | 485 | } |
415 | 486 | ||
487 | static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
488 | { | ||
489 | char nm[20]; | ||
490 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
491 | return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); | ||
492 | } | ||
493 | |||
494 | static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) | ||
495 | { | ||
496 | char nm[20]; | ||
497 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
498 | sysfs_remove_link(&mddev->kobj, nm); | ||
499 | } | ||
500 | |||
416 | /* | 501 | /* |
417 | * iterates through some rdev ringlist. It's safe to remove the | 502 | * iterates through some rdev ringlist. It's safe to remove the |
418 | * current 'rdev'. Dont touch 'tmp' though. | 503 | * current 'rdev'. Dont touch 'tmp' though. |
@@ -475,7 +560,7 @@ extern int register_md_personality(struct mdk_personality *p); | |||
475 | extern int unregister_md_personality(struct mdk_personality *p); | 560 | extern int unregister_md_personality(struct mdk_personality *p); |
476 | extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), | 561 | extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), |
477 | mddev_t *mddev, const char *name); | 562 | mddev_t *mddev, const char *name); |
478 | extern void md_unregister_thread(mdk_thread_t *thread); | 563 | extern void md_unregister_thread(mdk_thread_t **threadp); |
479 | extern void md_wakeup_thread(mdk_thread_t *thread); | 564 | extern void md_wakeup_thread(mdk_thread_t *thread); |
480 | extern void md_check_recovery(mddev_t *mddev); | 565 | extern void md_check_recovery(mddev_t *mddev); |
481 | extern void md_write_start(mddev_t *mddev, struct bio *bi); | 566 | extern void md_write_start(mddev_t *mddev, struct bio *bi); |
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev); | |||
505 | extern int md_run(mddev_t *mddev); | 590 | extern int md_run(mddev_t *mddev); |
506 | extern void md_stop(mddev_t *mddev); | 591 | extern void md_stop(mddev_t *mddev); |
507 | extern void md_stop_writes(mddev_t *mddev); | 592 | extern void md_stop_writes(mddev_t *mddev); |
508 | extern void md_rdev_init(mdk_rdev_t *rdev); | 593 | extern int md_rdev_init(mdk_rdev_t *rdev); |
509 | 594 | ||
510 | extern void mddev_suspend(mddev_t *mddev); | 595 | extern void mddev_suspend(mddev_t *mddev); |
511 | extern void mddev_resume(mddev_t *mddev); | 596 | extern void mddev_resume(mddev_t *mddev); |
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
514 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | 599 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
515 | mddev_t *mddev); | 600 | mddev_t *mddev); |
516 | extern int mddev_check_plugged(mddev_t *mddev); | 601 | extern int mddev_check_plugged(mddev_t *mddev); |
602 | extern void md_trim_bio(struct bio *bio, int offset, int size); | ||
517 | #endif /* _MD_MD_H */ | 603 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 3535c23af28..d5b5fb30017 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -514,8 +514,7 @@ static int multipath_stop (mddev_t *mddev) | |||
514 | { | 514 | { |
515 | multipath_conf_t *conf = mddev->private; | 515 | multipath_conf_t *conf = mddev->private; |
516 | 516 | ||
517 | md_unregister_thread(mddev->thread); | 517 | md_unregister_thread(&mddev->thread); |
518 | mddev->thread = NULL; | ||
519 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 518 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
520 | mempool_destroy(conf->pool); | 519 | mempool_destroy(conf->pool); |
521 | kfree(conf->multipaths); | 520 | kfree(conf->multipaths); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f7431b6d844..606fc04fd76 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -35,16 +35,13 @@ | |||
35 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
38 | #include <linux/ratelimit.h> | ||
38 | #include "md.h" | 39 | #include "md.h" |
39 | #include "raid1.h" | 40 | #include "raid1.h" |
40 | #include "bitmap.h" | 41 | #include "bitmap.h" |
41 | 42 | ||
42 | #define DEBUG 0 | 43 | #define DEBUG 0 |
43 | #if DEBUG | 44 | #define PRINTK(x...) do { if (DEBUG) printk(x); } while (0) |
44 | #define PRINTK(x...) printk(x) | ||
45 | #else | ||
46 | #define PRINTK(x...) | ||
47 | #endif | ||
48 | 45 | ||
49 | /* | 46 | /* |
50 | * Number of guaranteed r1bios in case of extreme VM load: | 47 | * Number of guaranteed r1bios in case of extreme VM load: |
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
166 | 163 | ||
167 | for (i = 0; i < conf->raid_disks; i++) { | 164 | for (i = 0; i < conf->raid_disks; i++) { |
168 | struct bio **bio = r1_bio->bios + i; | 165 | struct bio **bio = r1_bio->bios + i; |
169 | if (*bio && *bio != IO_BLOCKED) | 166 | if (!BIO_SPECIAL(*bio)) |
170 | bio_put(*bio); | 167 | bio_put(*bio); |
171 | *bio = NULL; | 168 | *bio = NULL; |
172 | } | 169 | } |
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
176 | { | 173 | { |
177 | conf_t *conf = r1_bio->mddev->private; | 174 | conf_t *conf = r1_bio->mddev->private; |
178 | 175 | ||
179 | /* | ||
180 | * Wake up any possible resync thread that waits for the device | ||
181 | * to go idle. | ||
182 | */ | ||
183 | allow_barrier(conf); | ||
184 | |||
185 | put_all_bios(conf, r1_bio); | 176 | put_all_bios(conf, r1_bio); |
186 | mempool_free(r1_bio, conf->r1bio_pool); | 177 | mempool_free(r1_bio, conf->r1bio_pool); |
187 | } | 178 | } |
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
222 | * operation and are ready to return a success/failure code to the buffer | 213 | * operation and are ready to return a success/failure code to the buffer |
223 | * cache layer. | 214 | * cache layer. |
224 | */ | 215 | */ |
216 | static void call_bio_endio(r1bio_t *r1_bio) | ||
217 | { | ||
218 | struct bio *bio = r1_bio->master_bio; | ||
219 | int done; | ||
220 | conf_t *conf = r1_bio->mddev->private; | ||
221 | |||
222 | if (bio->bi_phys_segments) { | ||
223 | unsigned long flags; | ||
224 | spin_lock_irqsave(&conf->device_lock, flags); | ||
225 | bio->bi_phys_segments--; | ||
226 | done = (bio->bi_phys_segments == 0); | ||
227 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
228 | } else | ||
229 | done = 1; | ||
230 | |||
231 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
232 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
233 | if (done) { | ||
234 | bio_endio(bio, 0); | ||
235 | /* | ||
236 | * Wake up any possible resync thread that waits for the device | ||
237 | * to go idle. | ||
238 | */ | ||
239 | allow_barrier(conf); | ||
240 | } | ||
241 | } | ||
242 | |||
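The new call_bio_endio() treats bio->bi_phys_segments as a count of outstanding sub-requests for one master bio and only completes the bio when that count drops to zero (zero meaning "never split"). A minimal userspace sketch of that counting pattern follows; the struct and function names are hypothetical, not kernel API.

/* Sketch of the split-request completion counting used above.
 * Hypothetical userspace model, not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct master_req {
	pthread_mutex_t lock;
	int parts_left;		/* like bio->bi_phys_segments; 0 means "never split" */
	int error_seen;		/* like a cleared BIO_UPTODATE */
};

/* Called once per sub-request completion, like call_bio_endio(). */
static void complete_part(struct master_req *m, int ok)
{
	int done;

	pthread_mutex_lock(&m->lock);
	if (!ok)
		m->error_seen = 1;
	if (m->parts_left) {
		m->parts_left--;
		done = (m->parts_left == 0);
	} else {
		done = 1;		/* never split: finish immediately */
	}
	pthread_mutex_unlock(&m->lock);

	if (done)
		printf("master request done, %s\n",
		       m->error_seen ? "with error" : "clean");
}

int main(void)
{
	struct master_req m = { PTHREAD_MUTEX_INITIALIZER, 2, 0 };

	complete_part(&m, 1);	/* first half: nothing printed yet */
	complete_part(&m, 1);	/* second half: master completes */
	return 0;
}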
225 | static void raid_end_bio_io(r1bio_t *r1_bio) | 243 | static void raid_end_bio_io(r1bio_t *r1_bio) |
226 | { | 244 | { |
227 | struct bio *bio = r1_bio->master_bio; | 245 | struct bio *bio = r1_bio->master_bio; |
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
234 | (unsigned long long) bio->bi_sector + | 252 | (unsigned long long) bio->bi_sector + |
235 | (bio->bi_size >> 9) - 1); | 253 | (bio->bi_size >> 9) - 1); |
236 | 254 | ||
237 | bio_endio(bio, | 255 | call_bio_endio(r1_bio); |
238 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
239 | } | 256 | } |
240 | free_r1bio(r1_bio); | 257 | free_r1bio(r1_bio); |
241 | } | 258 | } |
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
287 | * oops, read error: | 304 | * oops, read error: |
288 | */ | 305 | */ |
289 | char b[BDEVNAME_SIZE]; | 306 | char b[BDEVNAME_SIZE]; |
290 | if (printk_ratelimit()) | 307 | printk_ratelimited( |
291 | printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", | 308 | KERN_ERR "md/raid1:%s: %s: " |
292 | mdname(conf->mddev), | 309 | "rescheduling sector %llu\n", |
293 | bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); | 310 | mdname(conf->mddev), |
311 | bdevname(conf->mirrors[mirror].rdev->bdev, | ||
312 | b), | ||
313 | (unsigned long long)r1_bio->sector); | ||
314 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
294 | reschedule_retry(r1_bio); | 315 | reschedule_retry(r1_bio); |
295 | } | 316 | } |
296 | 317 | ||
297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 318 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
298 | } | 319 | } |
299 | 320 | ||
321 | static void close_write(r1bio_t *r1_bio) | ||
322 | { | ||
323 | /* it really is the end of this request */ | ||
324 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
325 | /* free extra copy of the data pages */ | ||
326 | int i = r1_bio->behind_page_count; | ||
327 | while (i--) | ||
328 | safe_put_page(r1_bio->behind_bvecs[i].bv_page); | ||
329 | kfree(r1_bio->behind_bvecs); | ||
330 | r1_bio->behind_bvecs = NULL; | ||
331 | } | ||
332 | /* clear the bitmap if all writes complete successfully */ | ||
333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
334 | r1_bio->sectors, | ||
335 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
336 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
337 | md_write_end(r1_bio->mddev); | ||
338 | } | ||
339 | |||
300 | static void r1_bio_write_done(r1bio_t *r1_bio) | 340 | static void r1_bio_write_done(r1bio_t *r1_bio) |
301 | { | 341 | { |
302 | if (atomic_dec_and_test(&r1_bio->remaining)) | 342 | if (!atomic_dec_and_test(&r1_bio->remaining)) |
303 | { | 343 | return; |
304 | /* it really is the end of this request */ | 344 | |
305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | 345 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) |
306 | /* free extra copy of the data pages */ | 346 | reschedule_retry(r1_bio); |
307 | int i = r1_bio->behind_page_count; | 347 | else { |
308 | while (i--) | 348 | close_write(r1_bio); |
309 | safe_put_page(r1_bio->behind_pages[i]); | 349 | if (test_bit(R1BIO_MadeGood, &r1_bio->state)) |
310 | kfree(r1_bio->behind_pages); | 350 | reschedule_retry(r1_bio); |
311 | r1_bio->behind_pages = NULL; | 351 | else |
312 | } | 352 | raid_end_bio_io(r1_bio); |
313 | /* clear the bitmap if all writes complete successfully */ | ||
314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
315 | r1_bio->sectors, | ||
316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
318 | md_write_end(r1_bio->mddev); | ||
319 | raid_end_bio_io(r1_bio); | ||
320 | } | 353 | } |
321 | } | 354 | } |
322 | 355 | ||
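The rewritten r1_bio_write_done() is now a small dispatch: anything that needs bad-block bookkeeping is deferred to the raid1d thread, and only the clean case ends the master bio directly. A tiny illustrative sketch of that decision (the flag names below are stand-ins, not the kernel's R1BIO_* bits):

/* Sketch of the write-completion dispatch above. Illustrative only. */
#include <stdio.h>

#define WRITE_ERROR	(1 << 0)
#define MADE_GOOD	(1 << 1)

static const char *next_step(unsigned long state)
{
	if (state & WRITE_ERROR)
		return "reschedule to raid1d (narrow down the write error)";
	if (state & MADE_GOOD)
		return "close write, then reschedule (clear bad blocks)";
	return "close write and end the master bio";
}

int main(void)
{
	printf("%s\n", next_step(0));
	printf("%s\n", next_step(MADE_GOOD));
	printf("%s\n", next_step(WRITE_ERROR | MADE_GOOD));
	return 0;
}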
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
336 | /* | 369 | /* |
337 | * 'one mirror IO has finished' event handler: | 370 | * 'one mirror IO has finished' event handler: |
338 | */ | 371 | */ |
339 | r1_bio->bios[mirror] = NULL; | ||
340 | to_put = bio; | ||
341 | if (!uptodate) { | 372 | if (!uptodate) { |
342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 373 | set_bit(WriteErrorSeen, |
343 | /* an I/O failed, we can't clear the bitmap */ | 374 | &conf->mirrors[mirror].rdev->flags); |
344 | set_bit(R1BIO_Degraded, &r1_bio->state); | 375 | set_bit(R1BIO_WriteError, &r1_bio->state); |
345 | } else | 376 | } else { |
346 | /* | 377 | /* |
347 | * Set R1BIO_Uptodate in our master bio, so that we | 378 | * Set R1BIO_Uptodate in our master bio, so that we |
348 | * will return a good error code for to the higher | 379 | * will return a good error code for to the higher |
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
353 | * to user-side. So if something waits for IO, then it | 384 | * to user-side. So if something waits for IO, then it |
354 | * will wait for the 'master' bio. | 385 | * will wait for the 'master' bio. |
355 | */ | 386 | */ |
387 | sector_t first_bad; | ||
388 | int bad_sectors; | ||
389 | |||
390 | r1_bio->bios[mirror] = NULL; | ||
391 | to_put = bio; | ||
356 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 392 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
357 | 393 | ||
394 | /* Maybe we can clear some bad blocks. */ | ||
395 | if (is_badblock(conf->mirrors[mirror].rdev, | ||
396 | r1_bio->sector, r1_bio->sectors, | ||
397 | &first_bad, &bad_sectors)) { | ||
398 | r1_bio->bios[mirror] = IO_MADE_GOOD; | ||
399 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
400 | } | ||
401 | } | ||
402 | |||
358 | update_head_pos(mirror, r1_bio); | 403 | update_head_pos(mirror, r1_bio); |
359 | 404 | ||
360 | if (behind) { | 405 | if (behind) { |
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
377 | (unsigned long long) mbio->bi_sector, | 422 | (unsigned long long) mbio->bi_sector, |
378 | (unsigned long long) mbio->bi_sector + | 423 | (unsigned long long) mbio->bi_sector + |
379 | (mbio->bi_size >> 9) - 1); | 424 | (mbio->bi_size >> 9) - 1); |
380 | bio_endio(mbio, 0); | 425 | call_bio_endio(r1_bio); |
381 | } | 426 | } |
382 | } | 427 | } |
383 | } | 428 | } |
384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 429 | if (r1_bio->bios[mirror] == NULL) |
430 | rdev_dec_pending(conf->mirrors[mirror].rdev, | ||
431 | conf->mddev); | ||
385 | 432 | ||
386 | /* | 433 | /* |
387 | * Let's see if all mirrored write operations have finished | 434 | * Let's see if all mirrored write operations have finished |
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
408 | * | 455 | * |
409 | * The rdev for the device selected will have nr_pending incremented. | 456 | * The rdev for the device selected will have nr_pending incremented. |
410 | */ | 457 | */ |
411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 458 | static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors) |
412 | { | 459 | { |
413 | const sector_t this_sector = r1_bio->sector; | 460 | const sector_t this_sector = r1_bio->sector; |
414 | const int sectors = r1_bio->sectors; | 461 | int sectors; |
462 | int best_good_sectors; | ||
415 | int start_disk; | 463 | int start_disk; |
416 | int best_disk; | 464 | int best_disk; |
417 | int i; | 465 | int i; |
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
426 | * We take the first readable disk when above the resync window. | 474 | * We take the first readable disk when above the resync window. |
427 | */ | 475 | */ |
428 | retry: | 476 | retry: |
477 | sectors = r1_bio->sectors; | ||
429 | best_disk = -1; | 478 | best_disk = -1; |
430 | best_dist = MaxSector; | 479 | best_dist = MaxSector; |
480 | best_good_sectors = 0; | ||
481 | |||
431 | if (conf->mddev->recovery_cp < MaxSector && | 482 | if (conf->mddev->recovery_cp < MaxSector && |
432 | (this_sector + sectors >= conf->next_resync)) { | 483 | (this_sector + sectors >= conf->next_resync)) { |
433 | choose_first = 1; | 484 | choose_first = 1; |
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
439 | 490 | ||
440 | for (i = 0 ; i < conf->raid_disks ; i++) { | 491 | for (i = 0 ; i < conf->raid_disks ; i++) { |
441 | sector_t dist; | 492 | sector_t dist; |
493 | sector_t first_bad; | ||
494 | int bad_sectors; | ||
495 | |||
442 | int disk = start_disk + i; | 496 | int disk = start_disk + i; |
443 | if (disk >= conf->raid_disks) | 497 | if (disk >= conf->raid_disks) |
444 | disk -= conf->raid_disks; | 498 | disk -= conf->raid_disks; |
@@ -454,13 +508,51 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
454 | if (test_bit(WriteMostly, &rdev->flags)) { | 508 | if (test_bit(WriteMostly, &rdev->flags)) { |
455 | /* Don't balance among write-mostly, just | 509 | /* Don't balance among write-mostly, just |
456 | * use the first as a last resort */ | 510 | * use the first as a last resort */ |
457 | if (best_disk < 0) | 511 | if (best_disk < 0) { |
512 | if (is_badblock(rdev, this_sector, sectors, | ||
513 | &first_bad, &bad_sectors)) { | ||
514 | if (first_bad < this_sector) | ||
515 | /* Cannot use this */ | ||
516 | continue; | ||
517 | best_good_sectors = first_bad - this_sector; | ||
518 | } else | ||
519 | best_good_sectors = sectors; | ||
458 | best_disk = disk; | 520 | best_disk = disk; |
521 | } | ||
459 | continue; | 522 | continue; |
460 | } | 523 | } |
461 | /* This is a reasonable device to use. It might | 524 | /* This is a reasonable device to use. It might |
462 | * even be best. | 525 | * even be best. |
463 | */ | 526 | */ |
527 | if (is_badblock(rdev, this_sector, sectors, | ||
528 | &first_bad, &bad_sectors)) { | ||
529 | if (best_dist < MaxSector) | ||
530 | /* already have a better device */ | ||
531 | continue; | ||
532 | if (first_bad <= this_sector) { | ||
533 | /* cannot read here. If this is the 'primary' | ||
534 | * device, then we must not read beyond | ||
535 | * bad_sectors from another device.. | ||
536 | */ | ||
537 | bad_sectors -= (this_sector - first_bad); | ||
538 | if (choose_first && sectors > bad_sectors) | ||
539 | sectors = bad_sectors; | ||
540 | if (best_good_sectors > sectors) | ||
541 | best_good_sectors = sectors; | ||
542 | |||
543 | } else { | ||
544 | sector_t good_sectors = first_bad - this_sector; | ||
545 | if (good_sectors > best_good_sectors) { | ||
546 | best_good_sectors = good_sectors; | ||
547 | best_disk = disk; | ||
548 | } | ||
549 | if (choose_first) | ||
550 | break; | ||
551 | } | ||
552 | continue; | ||
553 | } else | ||
554 | best_good_sectors = sectors; | ||
555 | |||
464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | 556 | dist = abs(this_sector - conf->mirrors[disk].head_position); |
465 | if (choose_first | 557 | if (choose_first |
466 | /* Don't change to another disk for sequential reads */ | 558 | /* Don't change to another disk for sequential reads */ |
@@ -489,10 +581,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
489 | rdev_dec_pending(rdev, conf->mddev); | 581 | rdev_dec_pending(rdev, conf->mddev); |
490 | goto retry; | 582 | goto retry; |
491 | } | 583 | } |
584 | sectors = best_good_sectors; | ||
492 | conf->next_seq_sect = this_sector + sectors; | 585 | conf->next_seq_sect = this_sector + sectors; |
493 | conf->last_used = best_disk; | 586 | conf->last_used = best_disk; |
494 | } | 587 | } |
495 | rcu_read_unlock(); | 588 | rcu_read_unlock(); |
589 | *max_sectors = sectors; | ||
496 | 590 | ||
497 | return best_disk; | 591 | return best_disk; |
498 | } | 592 | } |
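read_balance() now also reports, via *max_sectors, how much of the request the chosen disk can actually serve given its bad-block list. The core of that is simple interval arithmetic; a self-contained sketch under assumed names (good_sectors() is a hypothetical helper, not the kernel function):

/* Sketch of how a single bad-block range limits a read, as in read_balance().
 * Names are illustrative, not kernel API. */
#include <stdio.h>

typedef unsigned long long sector_t;

/* How many of 'sectors' starting at 'start' can be read from a device
 * whose only bad range is [first_bad, first_bad + bad_len)? */
static sector_t good_sectors(sector_t start, sector_t sectors,
			     sector_t first_bad, sector_t bad_len)
{
	if (first_bad >= start + sectors)	/* bad range is beyond the request */
		return sectors;
	if (first_bad > start)			/* can read up to the bad range */
		return first_bad - start;
	if (first_bad + bad_len <= start)	/* bad range ends before the request */
		return sectors;
	return 0;				/* request starts inside the bad range */
}

int main(void)
{
	/* 8-sector read at sector 100; the device has bad sectors 104..105 */
	printf("%llu readable sectors\n", good_sectors(100, 8, 104, 2));
	return 0;
}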
@@ -672,30 +766,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | |||
672 | { | 766 | { |
673 | int i; | 767 | int i; |
674 | struct bio_vec *bvec; | 768 | struct bio_vec *bvec; |
675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), | 769 | struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), |
676 | GFP_NOIO); | 770 | GFP_NOIO); |
677 | if (unlikely(!pages)) | 771 | if (unlikely(!bvecs)) |
678 | return; | 772 | return; |
679 | 773 | ||
680 | bio_for_each_segment(bvec, bio, i) { | 774 | bio_for_each_segment(bvec, bio, i) { |
681 | pages[i] = alloc_page(GFP_NOIO); | 775 | bvecs[i] = *bvec; |
682 | if (unlikely(!pages[i])) | 776 | bvecs[i].bv_page = alloc_page(GFP_NOIO); |
777 | if (unlikely(!bvecs[i].bv_page)) | ||
683 | goto do_sync_io; | 778 | goto do_sync_io; |
684 | memcpy(kmap(pages[i]) + bvec->bv_offset, | 779 | memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, |
685 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | 780 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); |
686 | kunmap(pages[i]); | 781 | kunmap(bvecs[i].bv_page); |
687 | kunmap(bvec->bv_page); | 782 | kunmap(bvec->bv_page); |
688 | } | 783 | } |
689 | r1_bio->behind_pages = pages; | 784 | r1_bio->behind_bvecs = bvecs; |
690 | r1_bio->behind_page_count = bio->bi_vcnt; | 785 | r1_bio->behind_page_count = bio->bi_vcnt; |
691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | 786 | set_bit(R1BIO_BehindIO, &r1_bio->state); |
692 | return; | 787 | return; |
693 | 788 | ||
694 | do_sync_io: | 789 | do_sync_io: |
695 | for (i = 0; i < bio->bi_vcnt; i++) | 790 | for (i = 0; i < bio->bi_vcnt; i++) |
696 | if (pages[i]) | 791 | if (bvecs[i].bv_page) |
697 | put_page(pages[i]); | 792 | put_page(bvecs[i].bv_page); |
698 | kfree(pages); | 793 | kfree(bvecs); |
699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 794 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
700 | } | 795 | } |
701 | 796 | ||
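alloc_behind_pages() now keeps a private struct bio_vec array (copied lengths and offsets plus freshly allocated pages) rather than a bare page array, so later code such as narrow_write_error() can treat the behind copy like an ordinary segment list. A rough userspace analogue of that deep copy, with illustrative types only (the kernel keeps the original in-page offset; the sketch simplifies that away):

/* Sketch of copying segment descriptors plus their data, as the new
 * behind_bvecs allocation does. Types and names are illustrative. */
#include <stdlib.h>
#include <string.h>

struct seg { void *buf; size_t len; size_t off; };

/* Duplicate 'n' segments: keep the lengths, give each its own buffer. */
static struct seg *clone_segs(const struct seg *src, int n)
{
	struct seg *copy = calloc(n, sizeof(*copy));
	int i;

	if (!copy)
		return NULL;
	for (i = 0; i < n; i++) {
		copy[i].len = src[i].len;
		copy[i].off = 0;	/* simplified; the kernel preserves bv_offset */
		copy[i].buf = malloc(src[i].len);
		if (!copy[i].buf)
			goto fail;
		memcpy(copy[i].buf, (char *)src[i].buf + src[i].off, src[i].len);
	}
	return copy;

fail:
	while (i--)
		free(copy[i].buf);
	free(copy);
	return NULL;			/* caller falls back to synchronous I/O */
}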
@@ -705,7 +800,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
705 | mirror_info_t *mirror; | 800 | mirror_info_t *mirror; |
706 | r1bio_t *r1_bio; | 801 | r1bio_t *r1_bio; |
707 | struct bio *read_bio; | 802 | struct bio *read_bio; |
708 | int i, targets = 0, disks; | 803 | int i, disks; |
709 | struct bitmap *bitmap; | 804 | struct bitmap *bitmap; |
710 | unsigned long flags; | 805 | unsigned long flags; |
711 | const int rw = bio_data_dir(bio); | 806 | const int rw = bio_data_dir(bio); |
@@ -713,6 +808,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 808 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
714 | mdk_rdev_t *blocked_rdev; | 809 | mdk_rdev_t *blocked_rdev; |
715 | int plugged; | 810 | int plugged; |
811 | int first_clone; | ||
812 | int sectors_handled; | ||
813 | int max_sectors; | ||
716 | 814 | ||
717 | /* | 815 | /* |
718 | * Register the new request and wait if the reconstruction | 816 | * Register the new request and wait if the reconstruction |
@@ -759,11 +857,24 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
759 | r1_bio->mddev = mddev; | 857 | r1_bio->mddev = mddev; |
760 | r1_bio->sector = bio->bi_sector; | 858 | r1_bio->sector = bio->bi_sector; |
761 | 859 | ||
860 | /* We might need to issue multiple reads to different | ||
861 | * devices if there are bad blocks around, so we keep | ||
862 | * track of the number of reads in bio->bi_phys_segments. | ||
863 | * If this is 0, there is only one r1_bio and no locking | ||
864 | * will be needed when requests complete. If it is | ||
865 | * non-zero, then it is the number of not-completed requests. | ||
866 | */ | ||
867 | bio->bi_phys_segments = 0; | ||
868 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
869 | |||
762 | if (rw == READ) { | 870 | if (rw == READ) { |
763 | /* | 871 | /* |
764 | * read balancing logic: | 872 | * read balancing logic: |
765 | */ | 873 | */ |
766 | int rdisk = read_balance(conf, r1_bio); | 874 | int rdisk; |
875 | |||
876 | read_again: | ||
877 | rdisk = read_balance(conf, r1_bio, &max_sectors); | ||
767 | 878 | ||
768 | if (rdisk < 0) { | 879 | if (rdisk < 0) { |
769 | /* couldn't find anywhere to read from */ | 880 | /* couldn't find anywhere to read from */ |
@@ -784,6 +895,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
784 | r1_bio->read_disk = rdisk; | 895 | r1_bio->read_disk = rdisk; |
785 | 896 | ||
786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 897 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
898 | md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, | ||
899 | max_sectors); | ||
787 | 900 | ||
788 | r1_bio->bios[rdisk] = read_bio; | 901 | r1_bio->bios[rdisk] = read_bio; |
789 | 902 | ||
@@ -793,16 +906,52 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
793 | read_bio->bi_rw = READ | do_sync; | 906 | read_bio->bi_rw = READ | do_sync; |
794 | read_bio->bi_private = r1_bio; | 907 | read_bio->bi_private = r1_bio; |
795 | 908 | ||
796 | generic_make_request(read_bio); | 909 | if (max_sectors < r1_bio->sectors) { |
910 | /* could not read all from this device, so we will | ||
911 | * need another r1_bio. | ||
912 | */ | ||
913 | |||
914 | sectors_handled = (r1_bio->sector + max_sectors | ||
915 | - bio->bi_sector); | ||
916 | r1_bio->sectors = max_sectors; | ||
917 | spin_lock_irq(&conf->device_lock); | ||
918 | if (bio->bi_phys_segments == 0) | ||
919 | bio->bi_phys_segments = 2; | ||
920 | else | ||
921 | bio->bi_phys_segments++; | ||
922 | spin_unlock_irq(&conf->device_lock); | ||
923 | /* Cannot call generic_make_request directly | ||
924 | * as that will be queued in __make_request | ||
925 | * and subsequent mempool_alloc might block waiting | ||
926 | * for it. So hand bio over to raid1d. | ||
927 | */ | ||
928 | reschedule_retry(r1_bio); | ||
929 | |||
930 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
931 | |||
932 | r1_bio->master_bio = bio; | ||
933 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
934 | r1_bio->state = 0; | ||
935 | r1_bio->mddev = mddev; | ||
936 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
937 | goto read_again; | ||
938 | } else | ||
939 | generic_make_request(read_bio); | ||
797 | return 0; | 940 | return 0; |
798 | } | 941 | } |
799 | 942 | ||
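When read_balance() can serve only max_sectors of the request, the read path trims the clone, bumps bi_phys_segments, and loops with a fresh r1_bio for the remainder. Stripped of the locking and mempool details, the bookkeeping is a plain chunking loop; a hedged userspace sketch:

/* Sketch of the read_again splitting bookkeeping. Illustrative only. */
#include <stdio.h>

int main(void)
{
	unsigned long long start = 0, total = 24;	/* request: 24 sectors */
	unsigned long long handled = 0;			/* like sectors_handled */
	int parts = 0;

	while (handled < total) {
		unsigned long long max = 10;		/* what read_balance allowed */
		unsigned long long chunk = total - handled;

		if (chunk > max)
			chunk = max;
		parts++;
		printf("issue read: sector %llu, %llu sectors\n",
		       start + handled, chunk);
		handled += chunk;
	}
	printf("%d sub-reads, tracked by a bi_phys_segments-style counter\n", parts);
	return 0;
}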
800 | /* | 943 | /* |
801 | * WRITE: | 944 | * WRITE: |
802 | */ | 945 | */ |
803 | /* first select target devices under spinlock and | 946 | /* first select target devices under rcu_lock and |
804 | * inc refcount on their rdev. Record them by setting | 947 | * inc refcount on their rdev. Record them by setting |
805 | * bios[x] to bio | 948 | * bios[x] to bio |
949 | * If there are known/acknowledged bad blocks on any device on | ||
950 | * which we have seen a write error, we want to avoid writing those | ||
951 | * blocks. | ||
952 | * This potentially requires several writes to write around | ||
953 | * the bad blocks. Each set of writes gets its own r1bio | ||
954 | * with a set of bios attached. | ||
806 | */ | 955 | */ |
807 | plugged = mddev_check_plugged(mddev); | 956 | plugged = mddev_check_plugged(mddev); |
808 | 957 | ||
@@ -810,6 +959,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
810 | retry_write: | 959 | retry_write: |
811 | blocked_rdev = NULL; | 960 | blocked_rdev = NULL; |
812 | rcu_read_lock(); | 961 | rcu_read_lock(); |
962 | max_sectors = r1_bio->sectors; | ||
813 | for (i = 0; i < disks; i++) { | 963 | for (i = 0; i < disks; i++) { |
814 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 964 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
815 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 965 | if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { |
@@ -817,17 +967,56 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
817 | blocked_rdev = rdev; | 967 | blocked_rdev = rdev; |
818 | break; | 968 | break; |
819 | } | 969 | } |
820 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 970 | r1_bio->bios[i] = NULL; |
821 | atomic_inc(&rdev->nr_pending); | 971 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
822 | if (test_bit(Faulty, &rdev->flags)) { | 972 | set_bit(R1BIO_Degraded, &r1_bio->state); |
973 | continue; | ||
974 | } | ||
975 | |||
976 | atomic_inc(&rdev->nr_pending); | ||
977 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
978 | sector_t first_bad; | ||
979 | int bad_sectors; | ||
980 | int is_bad; | ||
981 | |||
982 | is_bad = is_badblock(rdev, r1_bio->sector, | ||
983 | max_sectors, | ||
984 | &first_bad, &bad_sectors); | ||
985 | if (is_bad < 0) { | ||
986 | /* mustn't write here until the bad block is | ||
987 | * acknowledged */ | ||
988 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
989 | blocked_rdev = rdev; | ||
990 | break; | ||
991 | } | ||
992 | if (is_bad && first_bad <= r1_bio->sector) { | ||
993 | /* Cannot write here at all */ | ||
994 | bad_sectors -= (r1_bio->sector - first_bad); | ||
995 | if (bad_sectors < max_sectors) | ||
996 | /* mustn't write more than bad_sectors | ||
997 | * to other devices yet | ||
998 | */ | ||
999 | max_sectors = bad_sectors; | ||
823 | rdev_dec_pending(rdev, mddev); | 1000 | rdev_dec_pending(rdev, mddev); |
824 | r1_bio->bios[i] = NULL; | 1001 | /* We don't set R1BIO_Degraded as that |
825 | } else { | 1002 | * only applies if the disk is |
826 | r1_bio->bios[i] = bio; | 1003 | * missing, so it might be re-added, |
827 | targets++; | 1004 | * and we want to know to recover this |
1005 | * chunk. | ||
1006 | * In this case the device is here, | ||
1007 | * and the fact that this chunk is not | ||
1008 | * in-sync is recorded in the bad | ||
1009 | * block log | ||
1010 | */ | ||
1011 | continue; | ||
828 | } | 1012 | } |
829 | } else | 1013 | if (is_bad) { |
830 | r1_bio->bios[i] = NULL; | 1014 | int good_sectors = first_bad - r1_bio->sector; |
1015 | if (good_sectors < max_sectors) | ||
1016 | max_sectors = good_sectors; | ||
1017 | } | ||
1018 | } | ||
1019 | r1_bio->bios[i] = bio; | ||
831 | } | 1020 | } |
832 | rcu_read_unlock(); | 1021 | rcu_read_unlock(); |
833 | 1022 | ||
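On the write side, every device with WriteErrorSeen is checked against its bad-block list: an unacknowledged bad block blocks the whole request, a bad range starting at or before the request shortens what may be written to the other devices, and a bad range further in simply caps max_sectors. The clamp is again interval arithmetic; a sketch with illustrative names:

/* Sketch of clamping a write length around one device's bad range,
 * mirroring the max_sectors logic above. Names are illustrative. */
#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t clamp_write(sector_t start, sector_t max_sectors,
			    sector_t first_bad, sector_t bad_len)
{
	if (first_bad <= start) {
		/* Cannot write to this device here: don't let the other
		 * devices get ahead of the still-bad region. */
		sector_t remaining_bad = bad_len - (start - first_bad);
		return remaining_bad < max_sectors ? remaining_bad : max_sectors;
	}
	if (first_bad < start + max_sectors)
		return first_bad - start;	/* stop just before the bad range */
	return max_sectors;
}

int main(void)
{
	printf("%llu\n", clamp_write(100, 32, 110, 4));	/* -> 10 */
	printf("%llu\n", clamp_write(100, 32, 96, 8));	/* -> 4 */
	return 0;
}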
@@ -838,51 +1027,57 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
838 | for (j = 0; j < i; j++) | 1027 | for (j = 0; j < i; j++) |
839 | if (r1_bio->bios[j]) | 1028 | if (r1_bio->bios[j]) |
840 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1029 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
841 | 1030 | r1_bio->state = 0; | |
842 | allow_barrier(conf); | 1031 | allow_barrier(conf); |
843 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1032 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
844 | wait_barrier(conf); | 1033 | wait_barrier(conf); |
845 | goto retry_write; | 1034 | goto retry_write; |
846 | } | 1035 | } |
847 | 1036 | ||
848 | BUG_ON(targets == 0); /* we never fail the last device */ | 1037 | if (max_sectors < r1_bio->sectors) { |
849 | 1038 | /* We are splitting this write into multiple parts, so | |
850 | if (targets < conf->raid_disks) { | 1039 | * we need to prepare for allocating another r1_bio. |
851 | /* array is degraded, we will not clear the bitmap | 1040 | */ |
852 | * on I/O completion (see raid1_end_write_request) */ | 1041 | r1_bio->sectors = max_sectors; |
853 | set_bit(R1BIO_Degraded, &r1_bio->state); | 1042 | spin_lock_irq(&conf->device_lock); |
1043 | if (bio->bi_phys_segments == 0) | ||
1044 | bio->bi_phys_segments = 2; | ||
1045 | else | ||
1046 | bio->bi_phys_segments++; | ||
1047 | spin_unlock_irq(&conf->device_lock); | ||
854 | } | 1048 | } |
855 | 1049 | sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; | |
856 | /* do behind I/O ? | ||
857 | * Not if there are too many, or cannot allocate memory, | ||
858 | * or a reader on WriteMostly is waiting for behind writes | ||
859 | * to flush */ | ||
860 | if (bitmap && | ||
861 | (atomic_read(&bitmap->behind_writes) | ||
862 | < mddev->bitmap_info.max_write_behind) && | ||
863 | !waitqueue_active(&bitmap->behind_wait)) | ||
864 | alloc_behind_pages(bio, r1_bio); | ||
865 | 1050 | ||
866 | atomic_set(&r1_bio->remaining, 1); | 1051 | atomic_set(&r1_bio->remaining, 1); |
867 | atomic_set(&r1_bio->behind_remaining, 0); | 1052 | atomic_set(&r1_bio->behind_remaining, 0); |
868 | 1053 | ||
869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | 1054 | first_clone = 1; |
870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
871 | for (i = 0; i < disks; i++) { | 1055 | for (i = 0; i < disks; i++) { |
872 | struct bio *mbio; | 1056 | struct bio *mbio; |
873 | if (!r1_bio->bios[i]) | 1057 | if (!r1_bio->bios[i]) |
874 | continue; | 1058 | continue; |
875 | 1059 | ||
876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1060 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
877 | r1_bio->bios[i] = mbio; | 1061 | md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); |
878 | 1062 | ||
879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 1063 | if (first_clone) { |
880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1064 | /* do behind I/O ? |
881 | mbio->bi_end_io = raid1_end_write_request; | 1065 | * Not if there are too many, or cannot |
882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | 1066 | * allocate memory, or a reader on WriteMostly |
883 | mbio->bi_private = r1_bio; | 1067 | * is waiting for behind writes to flush */ |
884 | 1068 | if (bitmap && | |
885 | if (r1_bio->behind_pages) { | 1069 | (atomic_read(&bitmap->behind_writes) |
1070 | < mddev->bitmap_info.max_write_behind) && | ||
1071 | !waitqueue_active(&bitmap->behind_wait)) | ||
1072 | alloc_behind_pages(mbio, r1_bio); | ||
1073 | |||
1074 | bitmap_startwrite(bitmap, r1_bio->sector, | ||
1075 | r1_bio->sectors, | ||
1076 | test_bit(R1BIO_BehindIO, | ||
1077 | &r1_bio->state)); | ||
1078 | first_clone = 0; | ||
1079 | } | ||
1080 | if (r1_bio->behind_bvecs) { | ||
886 | struct bio_vec *bvec; | 1081 | struct bio_vec *bvec; |
887 | int j; | 1082 | int j; |
888 | 1083 | ||
@@ -894,16 +1089,42 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
894 | * them all | 1089 | * them all |
895 | */ | 1090 | */ |
896 | __bio_for_each_segment(bvec, mbio, j, 0) | 1091 | __bio_for_each_segment(bvec, mbio, j, 0) |
897 | bvec->bv_page = r1_bio->behind_pages[j]; | 1092 | bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; |
898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 1093 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
899 | atomic_inc(&r1_bio->behind_remaining); | 1094 | atomic_inc(&r1_bio->behind_remaining); |
900 | } | 1095 | } |
901 | 1096 | ||
1097 | r1_bio->bios[i] = mbio; | ||
1098 | |||
1099 | mbio->bi_sector = (r1_bio->sector + | ||
1100 | conf->mirrors[i].rdev->data_offset); | ||
1101 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1102 | mbio->bi_end_io = raid1_end_write_request; | ||
1103 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; | ||
1104 | mbio->bi_private = r1_bio; | ||
1105 | |||
902 | atomic_inc(&r1_bio->remaining); | 1106 | atomic_inc(&r1_bio->remaining); |
903 | spin_lock_irqsave(&conf->device_lock, flags); | 1107 | spin_lock_irqsave(&conf->device_lock, flags); |
904 | bio_list_add(&conf->pending_bio_list, mbio); | 1108 | bio_list_add(&conf->pending_bio_list, mbio); |
905 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1109 | spin_unlock_irqrestore(&conf->device_lock, flags); |
906 | } | 1110 | } |
1111 | /* Mustn't call r1_bio_write_done before this next test, | ||
1112 | * as it could result in the bio being freed. | ||
1113 | */ | ||
1114 | if (sectors_handled < (bio->bi_size >> 9)) { | ||
1115 | r1_bio_write_done(r1_bio); | ||
1116 | /* We need another r1_bio. It has already been counted | ||
1117 | * in bio->bi_phys_segments | ||
1118 | */ | ||
1119 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
1120 | r1_bio->master_bio = bio; | ||
1121 | r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
1122 | r1_bio->state = 0; | ||
1123 | r1_bio->mddev = mddev; | ||
1124 | r1_bio->sector = bio->bi_sector + sectors_handled; | ||
1125 | goto retry_write; | ||
1126 | } | ||
1127 | |||
907 | r1_bio_write_done(r1_bio); | 1128 | r1_bio_write_done(r1_bio); |
908 | 1129 | ||
909 | /* In case raid1d snuck in to freeze_array */ | 1130 | /* In case raid1d snuck in to freeze_array */ |
@@ -952,9 +1173,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
952 | * However don't try a recovery from this drive as | 1173 | * However don't try a recovery from this drive as |
953 | * it is very likely to fail. | 1174 | * it is very likely to fail. |
954 | */ | 1175 | */ |
955 | mddev->recovery_disabled = 1; | 1176 | conf->recovery_disabled = mddev->recovery_disabled; |
956 | return; | 1177 | return; |
957 | } | 1178 | } |
1179 | set_bit(Blocked, &rdev->flags); | ||
958 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1180 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
959 | unsigned long flags; | 1181 | unsigned long flags; |
960 | spin_lock_irqsave(&conf->device_lock, flags); | 1182 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -1027,7 +1249,7 @@ static int raid1_spare_active(mddev_t *mddev) | |||
1027 | && !test_bit(Faulty, &rdev->flags) | 1249 | && !test_bit(Faulty, &rdev->flags) |
1028 | && !test_and_set_bit(In_sync, &rdev->flags)) { | 1250 | && !test_and_set_bit(In_sync, &rdev->flags)) { |
1029 | count++; | 1251 | count++; |
1030 | sysfs_notify_dirent(rdev->sysfs_state); | 1252 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
1031 | } | 1253 | } |
1032 | } | 1254 | } |
1033 | spin_lock_irqsave(&conf->device_lock, flags); | 1255 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -1048,6 +1270,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1048 | int first = 0; | 1270 | int first = 0; |
1049 | int last = mddev->raid_disks - 1; | 1271 | int last = mddev->raid_disks - 1; |
1050 | 1272 | ||
1273 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
1274 | return -EBUSY; | ||
1275 | |||
1051 | if (rdev->raid_disk >= 0) | 1276 | if (rdev->raid_disk >= 0) |
1052 | first = last = rdev->raid_disk; | 1277 | first = last = rdev->raid_disk; |
1053 | 1278 | ||
@@ -1103,7 +1328,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1103 | * is not possible. | 1328 | * is not possible. |
1104 | */ | 1329 | */ |
1105 | if (!test_bit(Faulty, &rdev->flags) && | 1330 | if (!test_bit(Faulty, &rdev->flags) && |
1106 | !mddev->recovery_disabled && | 1331 | mddev->recovery_disabled != conf->recovery_disabled && |
1107 | mddev->degraded < conf->raid_disks) { | 1332 | mddev->degraded < conf->raid_disks) { |
1108 | err = -EBUSY; | 1333 | err = -EBUSY; |
1109 | goto abort; | 1334 | goto abort; |
@@ -1155,6 +1380,8 @@ static void end_sync_write(struct bio *bio, int error) | |||
1155 | conf_t *conf = mddev->private; | 1380 | conf_t *conf = mddev->private; |
1156 | int i; | 1381 | int i; |
1157 | int mirror=0; | 1382 | int mirror=0; |
1383 | sector_t first_bad; | ||
1384 | int bad_sectors; | ||
1158 | 1385 | ||
1159 | for (i = 0; i < conf->raid_disks; i++) | 1386 | for (i = 0; i < conf->raid_disks; i++) |
1160 | if (r1_bio->bios[i] == bio) { | 1387 | if (r1_bio->bios[i] == bio) { |
@@ -1172,18 +1399,48 @@ static void end_sync_write(struct bio *bio, int error) | |||
1172 | s += sync_blocks; | 1399 | s += sync_blocks; |
1173 | sectors_to_go -= sync_blocks; | 1400 | sectors_to_go -= sync_blocks; |
1174 | } while (sectors_to_go > 0); | 1401 | } while (sectors_to_go > 0); |
1175 | md_error(mddev, conf->mirrors[mirror].rdev); | 1402 | set_bit(WriteErrorSeen, |
1176 | } | 1403 | &conf->mirrors[mirror].rdev->flags); |
1404 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
1405 | } else if (is_badblock(conf->mirrors[mirror].rdev, | ||
1406 | r1_bio->sector, | ||
1407 | r1_bio->sectors, | ||
1408 | &first_bad, &bad_sectors) && | ||
1409 | !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, | ||
1410 | r1_bio->sector, | ||
1411 | r1_bio->sectors, | ||
1412 | &first_bad, &bad_sectors) | ||
1413 | ) | ||
1414 | set_bit(R1BIO_MadeGood, &r1_bio->state); | ||
1177 | 1415 | ||
1178 | update_head_pos(mirror, r1_bio); | 1416 | update_head_pos(mirror, r1_bio); |
1179 | 1417 | ||
1180 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 1418 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
1181 | sector_t s = r1_bio->sectors; | 1419 | int s = r1_bio->sectors; |
1182 | put_buf(r1_bio); | 1420 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1183 | md_done_sync(mddev, s, uptodate); | 1421 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1422 | reschedule_retry(r1_bio); | ||
1423 | else { | ||
1424 | put_buf(r1_bio); | ||
1425 | md_done_sync(mddev, s, uptodate); | ||
1426 | } | ||
1184 | } | 1427 | } |
1185 | } | 1428 | } |
1186 | 1429 | ||
1430 | static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
1431 | int sectors, struct page *page, int rw) | ||
1432 | { | ||
1433 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
1434 | /* success */ | ||
1435 | return 1; | ||
1436 | if (rw == WRITE) | ||
1437 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1438 | /* need to record an error - either for the block or the device */ | ||
1439 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
1440 | md_error(rdev->mddev, rdev); | ||
1441 | return 0; | ||
1442 | } | ||
1443 | |||
1187 | static int fix_sync_read_error(r1bio_t *r1_bio) | 1444 | static int fix_sync_read_error(r1bio_t *r1_bio) |
1188 | { | 1445 | { |
1189 | /* Try some synchronous reads of other devices to get | 1446 | /* Try some synchronous reads of other devices to get |
@@ -1193,6 +1450,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1193 | * We don't need to freeze the array, because being in an | 1450 | * We don't need to freeze the array, because being in an |
1194 | * active sync request, there is no normal IO, and | 1451 | * active sync request, there is no normal IO, and |
1195 | * no overlapping syncs. | 1452 | * no overlapping syncs. |
1453 | * We don't need to check is_badblock() again as we | ||
1454 | * made sure that anything with a bad block in range | ||
1455 | * will have bi_end_io clear. | ||
1196 | */ | 1456 | */ |
1197 | mddev_t *mddev = r1_bio->mddev; | 1457 | mddev_t *mddev = r1_bio->mddev; |
1198 | conf_t *conf = mddev->private; | 1458 | conf_t *conf = mddev->private; |
@@ -1217,9 +1477,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1217 | * active, and resync is currently active | 1477 | * active, and resync is currently active |
1218 | */ | 1478 | */ |
1219 | rdev = conf->mirrors[d].rdev; | 1479 | rdev = conf->mirrors[d].rdev; |
1220 | if (sync_page_io(rdev, | 1480 | if (sync_page_io(rdev, sect, s<<9, |
1221 | sect, | ||
1222 | s<<9, | ||
1223 | bio->bi_io_vec[idx].bv_page, | 1481 | bio->bi_io_vec[idx].bv_page, |
1224 | READ, false)) { | 1482 | READ, false)) { |
1225 | success = 1; | 1483 | success = 1; |
@@ -1233,16 +1491,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1233 | 1491 | ||
1234 | if (!success) { | 1492 | if (!success) { |
1235 | char b[BDEVNAME_SIZE]; | 1493 | char b[BDEVNAME_SIZE]; |
1236 | /* Cannot read from anywhere, array is toast */ | 1494 | int abort = 0; |
1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | 1495 | /* Cannot read from anywhere, this block is lost. |
1496 | * Record a bad block on each device. If that doesn't | ||
1497 | * work just disable and interrupt the recovery. | ||
1498 | * Don't fail devices as that won't really help. | ||
1499 | */ | ||
1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | 1500 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" |
1239 | " for block %llu\n", | 1501 | " for block %llu\n", |
1240 | mdname(mddev), | 1502 | mdname(mddev), |
1241 | bdevname(bio->bi_bdev, b), | 1503 | bdevname(bio->bi_bdev, b), |
1242 | (unsigned long long)r1_bio->sector); | 1504 | (unsigned long long)r1_bio->sector); |
1243 | md_done_sync(mddev, r1_bio->sectors, 0); | 1505 | for (d = 0; d < conf->raid_disks; d++) { |
1244 | put_buf(r1_bio); | 1506 | rdev = conf->mirrors[d].rdev; |
1245 | return 0; | 1507 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
1508 | continue; | ||
1509 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
1510 | abort = 1; | ||
1511 | } | ||
1512 | if (abort) { | ||
1513 | mddev->recovery_disabled = 1; | ||
1514 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1515 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1516 | put_buf(r1_bio); | ||
1517 | return 0; | ||
1518 | } | ||
1519 | /* Try next page */ | ||
1520 | sectors -= s; | ||
1521 | sect += s; | ||
1522 | idx++; | ||
1523 | continue; | ||
1246 | } | 1524 | } |
1247 | 1525 | ||
1248 | start = d; | 1526 | start = d; |
@@ -1254,16 +1532,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1532 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1255 | continue; | 1533 | continue; |
1256 | rdev = conf->mirrors[d].rdev; | 1534 | rdev = conf->mirrors[d].rdev; |
1257 | if (sync_page_io(rdev, | 1535 | if (r1_sync_page_io(rdev, sect, s, |
1258 | sect, | 1536 | bio->bi_io_vec[idx].bv_page, |
1259 | s<<9, | 1537 | WRITE) == 0) { |
1260 | bio->bi_io_vec[idx].bv_page, | ||
1261 | WRITE, false) == 0) { | ||
1262 | r1_bio->bios[d]->bi_end_io = NULL; | 1538 | r1_bio->bios[d]->bi_end_io = NULL; |
1263 | rdev_dec_pending(rdev, mddev); | 1539 | rdev_dec_pending(rdev, mddev); |
1264 | md_error(mddev, rdev); | 1540 | } |
1265 | } else | ||
1266 | atomic_add(s, &rdev->corrected_errors); | ||
1267 | } | 1541 | } |
1268 | d = start; | 1542 | d = start; |
1269 | while (d != r1_bio->read_disk) { | 1543 | while (d != r1_bio->read_disk) { |
@@ -1273,12 +1547,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio) | |||
1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | 1547 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) |
1274 | continue; | 1548 | continue; |
1275 | rdev = conf->mirrors[d].rdev; | 1549 | rdev = conf->mirrors[d].rdev; |
1276 | if (sync_page_io(rdev, | 1550 | if (r1_sync_page_io(rdev, sect, s, |
1277 | sect, | 1551 | bio->bi_io_vec[idx].bv_page, |
1278 | s<<9, | 1552 | READ) != 0) |
1279 | bio->bi_io_vec[idx].bv_page, | 1553 | atomic_add(s, &rdev->corrected_errors); |
1280 | READ, false) == 0) | ||
1281 | md_error(mddev, rdev); | ||
1282 | } | 1554 | } |
1283 | sectors -= s; | 1555 | sectors -= s; |
1284 | sect += s; | 1556 | sect += s; |
@@ -1420,7 +1692,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | |||
1420 | * | 1692 | * |
1421 | * 1. Retries failed read operations on working mirrors. | 1693 | * 1. Retries failed read operations on working mirrors. |
1422 | * 2. Updates the raid superblock when problems encounter. | 1694 | * 2. Updates the raid superblock when problems encounter. |
1423 | * 3. Performs writes following reads for array syncronising. | 1695 | * 3. Performs writes following reads for array synchronising. |
1424 | */ | 1696 | */ |
1425 | 1697 | ||
1426 | static void fix_read_error(conf_t *conf, int read_disk, | 1698 | static void fix_read_error(conf_t *conf, int read_disk, |
@@ -1443,9 +1715,14 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1443 | * which is the thread that might remove | 1715 | * which is the thread that might remove |
1444 | * a device. If raid1d ever becomes multi-threaded.... | 1716 | * a device. If raid1d ever becomes multi-threaded.... |
1445 | */ | 1717 | */ |
1718 | sector_t first_bad; | ||
1719 | int bad_sectors; | ||
1720 | |||
1446 | rdev = conf->mirrors[d].rdev; | 1721 | rdev = conf->mirrors[d].rdev; |
1447 | if (rdev && | 1722 | if (rdev && |
1448 | test_bit(In_sync, &rdev->flags) && | 1723 | test_bit(In_sync, &rdev->flags) && |
1724 | is_badblock(rdev, sect, s, | ||
1725 | &first_bad, &bad_sectors) == 0 && | ||
1449 | sync_page_io(rdev, sect, s<<9, | 1726 | sync_page_io(rdev, sect, s<<9, |
1450 | conf->tmppage, READ, false)) | 1727 | conf->tmppage, READ, false)) |
1451 | success = 1; | 1728 | success = 1; |
@@ -1457,8 +1734,10 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1457 | } while (!success && d != read_disk); | 1734 | } while (!success && d != read_disk); |
1458 | 1735 | ||
1459 | if (!success) { | 1736 | if (!success) { |
1460 | /* Cannot read from anywhere -- bye bye array */ | 1737 | /* Cannot read from anywhere - mark it bad */ |
1461 | md_error(mddev, conf->mirrors[read_disk].rdev); | 1738 | mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev; |
1739 | if (!rdev_set_badblocks(rdev, sect, s, 0)) | ||
1740 | md_error(mddev, rdev); | ||
1462 | break; | 1741 | break; |
1463 | } | 1742 | } |
1464 | /* write it back and re-read */ | 1743 | /* write it back and re-read */ |
@@ -1469,13 +1748,9 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1469 | d--; | 1748 | d--; |
1470 | rdev = conf->mirrors[d].rdev; | 1749 | rdev = conf->mirrors[d].rdev; |
1471 | if (rdev && | 1750 | if (rdev && |
1472 | test_bit(In_sync, &rdev->flags)) { | 1751 | test_bit(In_sync, &rdev->flags)) |
1473 | if (sync_page_io(rdev, sect, s<<9, | 1752 | r1_sync_page_io(rdev, sect, s, |
1474 | conf->tmppage, WRITE, false) | 1753 | conf->tmppage, WRITE); |
1475 | == 0) | ||
1476 | /* Well, this device is dead */ | ||
1477 | md_error(mddev, rdev); | ||
1478 | } | ||
1479 | } | 1754 | } |
1480 | d = start; | 1755 | d = start; |
1481 | while (d != read_disk) { | 1756 | while (d != read_disk) { |
@@ -1486,12 +1761,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1486 | rdev = conf->mirrors[d].rdev; | 1761 | rdev = conf->mirrors[d].rdev; |
1487 | if (rdev && | 1762 | if (rdev && |
1488 | test_bit(In_sync, &rdev->flags)) { | 1763 | test_bit(In_sync, &rdev->flags)) { |
1489 | if (sync_page_io(rdev, sect, s<<9, | 1764 | if (r1_sync_page_io(rdev, sect, s, |
1490 | conf->tmppage, READ, false) | 1765 | conf->tmppage, READ)) { |
1491 | == 0) | ||
1492 | /* Well, this device is dead */ | ||
1493 | md_error(mddev, rdev); | ||
1494 | else { | ||
1495 | atomic_add(s, &rdev->corrected_errors); | 1766 | atomic_add(s, &rdev->corrected_errors); |
1496 | printk(KERN_INFO | 1767 | printk(KERN_INFO |
1497 | "md/raid1:%s: read error corrected " | 1768 | "md/raid1:%s: read error corrected " |
@@ -1508,21 +1779,255 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1508 | } | 1779 | } |
1509 | } | 1780 | } |
1510 | 1781 | ||
1782 | static void bi_complete(struct bio *bio, int error) | ||
1783 | { | ||
1784 | complete((struct completion *)bio->bi_private); | ||
1785 | } | ||
1786 | |||
1787 | static int submit_bio_wait(int rw, struct bio *bio) | ||
1788 | { | ||
1789 | struct completion event; | ||
1790 | rw |= REQ_SYNC; | ||
1791 | |||
1792 | init_completion(&event); | ||
1793 | bio->bi_private = &event; | ||
1794 | bio->bi_end_io = bi_complete; | ||
1795 | submit_bio(rw, bio); | ||
1796 | wait_for_completion(&event); | ||
1797 | |||
1798 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1799 | } | ||
1800 | |||
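The local submit_bio_wait() turns an asynchronous bio into a synchronous call by parking on a completion that the end_io callback fires. The same submit-and-wait pattern in plain C, using a condition variable as the completion (purely illustrative, not the kernel's completion API):

/* Userspace sketch of the pattern behind submit_bio_wait():
 * the callback signals, the submitter blocks until it does. */
#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int             done;
};

static void complete(struct completion *c)	/* like bi_complete() */
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *fake_io(void *arg)			/* stands in for the block layer */
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion c = { PTHREAD_MUTEX_INITIALIZER,
				PTHREAD_COND_INITIALIZER, 0 };
	pthread_t t;

	pthread_create(&t, NULL, fake_io, &c);	/* "submit_bio" */
	wait_for_completion(&c);		/* block until the end_io runs */
	pthread_join(t, NULL);
	printf("I/O finished synchronously\n");
	return 0;
}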
1801 | static int narrow_write_error(r1bio_t *r1_bio, int i) | ||
1802 | { | ||
1803 | mddev_t *mddev = r1_bio->mddev; | ||
1804 | conf_t *conf = mddev->private; | ||
1805 | mdk_rdev_t *rdev = conf->mirrors[i].rdev; | ||
1806 | int vcnt, idx; | ||
1807 | struct bio_vec *vec; | ||
1808 | |||
1809 | /* bio has the data to be written to device 'i' where | ||
1810 | * we just recently had a write error. | ||
1811 | * We repeatedly clone the bio and trim down to one block, | ||
1812 | * then try the write. Where the write fails we record | ||
1813 | * a bad block. | ||
1814 | * It is conceivable that the bio doesn't exactly align with | ||
1815 | * blocks. We must handle this somehow. | ||
1816 | * | ||
1817 | * We currently own a reference on the rdev. | ||
1818 | */ | ||
1819 | |||
1820 | int block_sectors; | ||
1821 | sector_t sector; | ||
1822 | int sectors; | ||
1823 | int sect_to_write = r1_bio->sectors; | ||
1824 | int ok = 1; | ||
1825 | |||
1826 | if (rdev->badblocks.shift < 0) | ||
1827 | return 0; | ||
1828 | |||
1829 | block_sectors = 1 << rdev->badblocks.shift; | ||
1830 | sector = r1_bio->sector; | ||
1831 | sectors = ((sector + block_sectors) | ||
1832 | & ~(sector_t)(block_sectors - 1)) | ||
1833 | - sector; | ||
1834 | |||
1835 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
1836 | vcnt = r1_bio->behind_page_count; | ||
1837 | vec = r1_bio->behind_bvecs; | ||
1838 | idx = 0; | ||
1839 | while (vec[idx].bv_page == NULL) | ||
1840 | idx++; | ||
1841 | } else { | ||
1842 | vcnt = r1_bio->master_bio->bi_vcnt; | ||
1843 | vec = r1_bio->master_bio->bi_io_vec; | ||
1844 | idx = r1_bio->master_bio->bi_idx; | ||
1845 | } | ||
1846 | while (sect_to_write) { | ||
1847 | struct bio *wbio; | ||
1848 | if (sectors > sect_to_write) | ||
1849 | sectors = sect_to_write; | ||
1850 | /* Write at 'sector' for 'sectors'*/ | ||
1851 | |||
1852 | wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); | ||
1853 | memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); | ||
1854 | wbio->bi_sector = r1_bio->sector; | ||
1855 | wbio->bi_rw = WRITE; | ||
1856 | wbio->bi_vcnt = vcnt; | ||
1857 | wbio->bi_size = r1_bio->sectors << 9; | ||
1858 | wbio->bi_idx = idx; | ||
1859 | |||
1860 | md_trim_bio(wbio, sector - r1_bio->sector, sectors); | ||
1861 | wbio->bi_sector += rdev->data_offset; | ||
1862 | wbio->bi_bdev = rdev->bdev; | ||
1863 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
1864 | /* failure! */ | ||
1865 | ok = rdev_set_badblocks(rdev, sector, | ||
1866 | sectors, 0) | ||
1867 | && ok; | ||
1868 | |||
1869 | bio_put(wbio); | ||
1870 | sect_to_write -= sectors; | ||
1871 | sector += sectors; | ||
1872 | sectors = block_sectors; | ||
1873 | } | ||
1874 | return ok; | ||
1875 | } | ||
1876 | |||
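narrow_write_error() retries the failed write one bad-block-sized chunk at a time, so a persistent failure can be recorded at bad-block granularity rather than failing the whole device. The only subtle part is the first chunk, which must end on the next block boundary; the alignment arithmetic works out as in this hypothetical standalone sketch:

/* Sketch of the chunking in narrow_write_error(): the first chunk runs
 * to the next block boundary, the rest are whole blocks. Illustrative. */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int shift = 3;				/* badblocks.shift: 8-sector blocks */
	sector_t block = 1ULL << shift;
	sector_t sector = 1001, left = 20;	/* unaligned 20-sector write */

	/* distance from 'sector' up to the next multiple of 'block' */
	sector_t chunk = ((sector + block) & ~(block - 1)) - sector;

	while (left) {
		if (chunk > left)
			chunk = left;
		printf("write %llu sectors at %llu\n", chunk, sector);
		sector += chunk;
		left -= chunk;
		chunk = block;			/* subsequent chunks are full blocks */
	}
	return 0;
}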
1877 | static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
1878 | { | ||
1879 | int m; | ||
1880 | int s = r1_bio->sectors; | ||
1881 | for (m = 0; m < conf->raid_disks ; m++) { | ||
1882 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
1883 | struct bio *bio = r1_bio->bios[m]; | ||
1884 | if (bio->bi_end_io == NULL) | ||
1885 | continue; | ||
1886 | if (test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
1887 | test_bit(R1BIO_MadeGood, &r1_bio->state)) { | ||
1888 | rdev_clear_badblocks(rdev, r1_bio->sector, s); | ||
1889 | } | ||
1890 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
1891 | test_bit(R1BIO_WriteError, &r1_bio->state)) { | ||
1892 | if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) | ||
1893 | md_error(conf->mddev, rdev); | ||
1894 | } | ||
1895 | } | ||
1896 | put_buf(r1_bio); | ||
1897 | md_done_sync(conf->mddev, s, 1); | ||
1898 | } | ||
1899 | |||
1900 | static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio) | ||
1901 | { | ||
1902 | int m; | ||
1903 | for (m = 0; m < conf->raid_disks ; m++) | ||
1904 | if (r1_bio->bios[m] == IO_MADE_GOOD) { | ||
1905 | mdk_rdev_t *rdev = conf->mirrors[m].rdev; | ||
1906 | rdev_clear_badblocks(rdev, | ||
1907 | r1_bio->sector, | ||
1908 | r1_bio->sectors); | ||
1909 | rdev_dec_pending(rdev, conf->mddev); | ||
1910 | } else if (r1_bio->bios[m] != NULL) { | ||
1911 | /* This drive got a write error. We need to | ||
1912 | * narrow down and record precise write | ||
1913 | * errors. | ||
1914 | */ | ||
1915 | if (!narrow_write_error(r1_bio, m)) { | ||
1916 | md_error(conf->mddev, | ||
1917 | conf->mirrors[m].rdev); | ||
1918 | /* an I/O failed, we can't clear the bitmap */ | ||
1919 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
1920 | } | ||
1921 | rdev_dec_pending(conf->mirrors[m].rdev, | ||
1922 | conf->mddev); | ||
1923 | } | ||
1924 | if (test_bit(R1BIO_WriteError, &r1_bio->state)) | ||
1925 | close_write(r1_bio); | ||
1926 | raid_end_bio_io(r1_bio); | ||
1927 | } | ||
1928 | |||
1929 | static void handle_read_error(conf_t *conf, r1bio_t *r1_bio) | ||
1930 | { | ||
1931 | int disk; | ||
1932 | int max_sectors; | ||
1933 | mddev_t *mddev = conf->mddev; | ||
1934 | struct bio *bio; | ||
1935 | char b[BDEVNAME_SIZE]; | ||
1936 | mdk_rdev_t *rdev; | ||
1937 | |||
1938 | clear_bit(R1BIO_ReadError, &r1_bio->state); | ||
1939 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1940 | * the block and we can fix it. | ||
1941 | * We freeze all other IO, and try reading the block from | ||
1942 | * other devices. When we find one, we re-write | ||
1943 | * and check it that fixes the read error. | ||
1944 | * This is all done synchronously while the array is | ||
1945 | * frozen | ||
1946 | */ | ||
1947 | if (mddev->ro == 0) { | ||
1948 | freeze_array(conf); | ||
1949 | fix_read_error(conf, r1_bio->read_disk, | ||
1950 | r1_bio->sector, r1_bio->sectors); | ||
1951 | unfreeze_array(conf); | ||
1952 | } else | ||
1953 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1954 | |||
1955 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1956 | bdevname(bio->bi_bdev, b); | ||
1957 | read_more: | ||
1958 | disk = read_balance(conf, r1_bio, &max_sectors); | ||
1959 | if (disk == -1) { | ||
1960 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
1961 | " read error for block %llu\n", | ||
1962 | mdname(mddev), b, (unsigned long long)r1_bio->sector); | ||
1963 | raid_end_bio_io(r1_bio); | ||
1964 | } else { | ||
1965 | const unsigned long do_sync | ||
1966 | = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
1967 | if (bio) { | ||
1968 | r1_bio->bios[r1_bio->read_disk] = | ||
1969 | mddev->ro ? IO_BLOCKED : NULL; | ||
1970 | bio_put(bio); | ||
1971 | } | ||
1972 | r1_bio->read_disk = disk; | ||
1973 | bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); | ||
1974 | md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); | ||
1975 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
1976 | rdev = conf->mirrors[disk].rdev; | ||
1977 | printk_ratelimited(KERN_ERR | ||
1978 | "md/raid1:%s: redirecting sector %llu" | ||
1979 | " to other mirror: %s\n", | ||
1980 | mdname(mddev), | ||
1981 | (unsigned long long)r1_bio->sector, | ||
1982 | bdevname(rdev->bdev, b)); | ||
1983 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
1984 | bio->bi_bdev = rdev->bdev; | ||
1985 | bio->bi_end_io = raid1_end_read_request; | ||
1986 | bio->bi_rw = READ | do_sync; | ||
1987 | bio->bi_private = r1_bio; | ||
1988 | if (max_sectors < r1_bio->sectors) { | ||
1989 | /* Drat - have to split this up more */ | ||
1990 | struct bio *mbio = r1_bio->master_bio; | ||
1991 | int sectors_handled = (r1_bio->sector + max_sectors | ||
1992 | - mbio->bi_sector); | ||
1993 | r1_bio->sectors = max_sectors; | ||
1994 | spin_lock_irq(&conf->device_lock); | ||
1995 | if (mbio->bi_phys_segments == 0) | ||
1996 | mbio->bi_phys_segments = 2; | ||
1997 | else | ||
1998 | mbio->bi_phys_segments++; | ||
1999 | spin_unlock_irq(&conf->device_lock); | ||
2000 | generic_make_request(bio); | ||
2001 | bio = NULL; | ||
2002 | |||
2003 | r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); | ||
2004 | |||
2005 | r1_bio->master_bio = mbio; | ||
2006 | r1_bio->sectors = (mbio->bi_size >> 9) | ||
2007 | - sectors_handled; | ||
2008 | r1_bio->state = 0; | ||
2009 | set_bit(R1BIO_ReadError, &r1_bio->state); | ||
2010 | r1_bio->mddev = mddev; | ||
2011 | r1_bio->sector = mbio->bi_sector + sectors_handled; | ||
2012 | |||
2013 | goto read_more; | ||
2014 | } else | ||
2015 | generic_make_request(bio); | ||
2016 | } | ||
2017 | } | ||
2018 | |||
1511 | static void raid1d(mddev_t *mddev) | 2019 | static void raid1d(mddev_t *mddev) |
1512 | { | 2020 | { |
1513 | r1bio_t *r1_bio; | 2021 | r1bio_t *r1_bio; |
1514 | struct bio *bio; | ||
1515 | unsigned long flags; | 2022 | unsigned long flags; |
1516 | conf_t *conf = mddev->private; | 2023 | conf_t *conf = mddev->private; |
1517 | struct list_head *head = &conf->retry_list; | 2024 | struct list_head *head = &conf->retry_list; |
1518 | mdk_rdev_t *rdev; | ||
1519 | struct blk_plug plug; | 2025 | struct blk_plug plug; |
1520 | 2026 | ||
1521 | md_check_recovery(mddev); | 2027 | md_check_recovery(mddev); |
1522 | 2028 | ||
1523 | blk_start_plug(&plug); | 2029 | blk_start_plug(&plug); |
1524 | for (;;) { | 2030 | for (;;) { |
1525 | char b[BDEVNAME_SIZE]; | ||
1526 | 2031 | ||
1527 | if (atomic_read(&mddev->plug_cnt) == 0) | 2032 | if (atomic_read(&mddev->plug_cnt) == 0) |
1528 | flush_pending_writes(conf); | 2033 | flush_pending_writes(conf); |
@@ -1539,62 +2044,26 @@ static void raid1d(mddev_t *mddev) | |||
1539 | 2044 | ||
1540 | mddev = r1_bio->mddev; | 2045 | mddev = r1_bio->mddev; |
1541 | conf = mddev->private; | 2046 | conf = mddev->private; |
1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) | 2047 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
1543 | sync_request_write(mddev, r1_bio); | 2048 | if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1544 | else { | 2049 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1545 | int disk; | 2050 | handle_sync_write_finished(conf, r1_bio); |
1546 | 2051 | else | |
1547 | /* we got a read error. Maybe the drive is bad. Maybe just | 2052 | sync_request_write(mddev, r1_bio); |
1548 | * the block and we can fix it. | 2053 | } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || |
1549 | * We freeze all other IO, and try reading the block from | 2054 | test_bit(R1BIO_WriteError, &r1_bio->state)) |
1550 | * other devices. When we find one, we re-write | 2055 | handle_write_finished(conf, r1_bio); |
1551 | * and check it that fixes the read error. | 2056 | else if (test_bit(R1BIO_ReadError, &r1_bio->state)) |
1552 | * This is all done synchronously while the array is | 2057 | handle_read_error(conf, r1_bio); |
1553 | * frozen | 2058 | else |
2059 | /* just a partial read to be scheduled from separate | ||
2060 | * context | ||
1554 | */ | 2061 | */ |
1555 | if (mddev->ro == 0) { | 2062 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); |
1556 | freeze_array(conf); | 2063 | |
1557 | fix_read_error(conf, r1_bio->read_disk, | ||
1558 | r1_bio->sector, | ||
1559 | r1_bio->sectors); | ||
1560 | unfreeze_array(conf); | ||
1561 | } else | ||
1562 | md_error(mddev, | ||
1563 | conf->mirrors[r1_bio->read_disk].rdev); | ||
1564 | |||
1565 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1566 | if ((disk=read_balance(conf, r1_bio)) == -1) { | ||
1567 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | ||
1568 | " read error for block %llu\n", | ||
1569 | mdname(mddev), | ||
1570 | bdevname(bio->bi_bdev,b), | ||
1571 | (unsigned long long)r1_bio->sector); | ||
1572 | raid_end_bio_io(r1_bio); | ||
1573 | } else { | ||
1574 | const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC; | ||
1575 | r1_bio->bios[r1_bio->read_disk] = | ||
1576 | mddev->ro ? IO_BLOCKED : NULL; | ||
1577 | r1_bio->read_disk = disk; | ||
1578 | bio_put(bio); | ||
1579 | bio = bio_clone_mddev(r1_bio->master_bio, | ||
1580 | GFP_NOIO, mddev); | ||
1581 | r1_bio->bios[r1_bio->read_disk] = bio; | ||
1582 | rdev = conf->mirrors[disk].rdev; | ||
1583 | if (printk_ratelimit()) | ||
1584 | printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" | ||
1585 | " other mirror: %s\n", | ||
1586 | mdname(mddev), | ||
1587 | (unsigned long long)r1_bio->sector, | ||
1588 | bdevname(rdev->bdev,b)); | ||
1589 | bio->bi_sector = r1_bio->sector + rdev->data_offset; | ||
1590 | bio->bi_bdev = rdev->bdev; | ||
1591 | bio->bi_end_io = raid1_end_read_request; | ||
1592 | bio->bi_rw = READ | do_sync; | ||
1593 | bio->bi_private = r1_bio; | ||
1594 | generic_make_request(bio); | ||
1595 | } | ||
1596 | } | ||
1597 | cond_resched(); | 2064 | cond_resched(); |
2065 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
2066 | md_check_recovery(mddev); | ||
1598 | } | 2067 | } |
1599 | blk_finish_plug(&plug); | 2068 | blk_finish_plug(&plug); |
1600 | } | 2069 | } |
@@ -1636,6 +2105,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1636 | int write_targets = 0, read_targets = 0; | 2105 | int write_targets = 0, read_targets = 0; |
1637 | sector_t sync_blocks; | 2106 | sector_t sync_blocks; |
1638 | int still_degraded = 0; | 2107 | int still_degraded = 0; |
2108 | int good_sectors = RESYNC_SECTORS; | ||
2109 | int min_bad = 0; /* number of sectors that are bad in all devices */ | ||
1639 | 2110 | ||
1640 | if (!conf->r1buf_pool) | 2111 | if (!conf->r1buf_pool) |
1641 | if (init_resync(conf)) | 2112 | if (init_resync(conf)) |
@@ -1723,36 +2194,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1723 | 2194 | ||
1724 | rdev = rcu_dereference(conf->mirrors[i].rdev); | 2195 | rdev = rcu_dereference(conf->mirrors[i].rdev); |
1725 | if (rdev == NULL || | 2196 | if (rdev == NULL || |
1726 | test_bit(Faulty, &rdev->flags)) { | 2197 | test_bit(Faulty, &rdev->flags)) { |
1727 | still_degraded = 1; | 2198 | still_degraded = 1; |
1728 | continue; | ||
1729 | } else if (!test_bit(In_sync, &rdev->flags)) { | 2199 | } else if (!test_bit(In_sync, &rdev->flags)) { |
1730 | bio->bi_rw = WRITE; | 2200 | bio->bi_rw = WRITE; |
1731 | bio->bi_end_io = end_sync_write; | 2201 | bio->bi_end_io = end_sync_write; |
1732 | write_targets ++; | 2202 | write_targets ++; |
1733 | } else { | 2203 | } else { |
1734 | /* may need to read from here */ | 2204 | /* may need to read from here */ |
1735 | bio->bi_rw = READ; | 2205 | sector_t first_bad = MaxSector; |
1736 | bio->bi_end_io = end_sync_read; | 2206 | int bad_sectors; |
1737 | if (test_bit(WriteMostly, &rdev->flags)) { | 2207 | |
1738 | if (wonly < 0) | 2208 | if (is_badblock(rdev, sector_nr, good_sectors, |
1739 | wonly = i; | 2209 | &first_bad, &bad_sectors)) { |
1740 | } else { | 2210 | if (first_bad > sector_nr) |
1741 | if (disk < 0) | 2211 | good_sectors = first_bad - sector_nr; |
1742 | disk = i; | 2212 | else { |
2213 | bad_sectors -= (sector_nr - first_bad); | ||
2214 | if (min_bad == 0 || | ||
2215 | min_bad > bad_sectors) | ||
2216 | min_bad = bad_sectors; | ||
2217 | } | ||
2218 | } | ||
2219 | if (sector_nr < first_bad) { | ||
2220 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
2221 | if (wonly < 0) | ||
2222 | wonly = i; | ||
2223 | } else { | ||
2224 | if (disk < 0) | ||
2225 | disk = i; | ||
2226 | } | ||
2227 | bio->bi_rw = READ; | ||
2228 | bio->bi_end_io = end_sync_read; | ||
2229 | read_targets++; | ||
1743 | } | 2230 | } |
1744 | read_targets++; | ||
1745 | } | 2231 | } |
1746 | atomic_inc(&rdev->nr_pending); | 2232 | if (bio->bi_end_io) { |
1747 | bio->bi_sector = sector_nr + rdev->data_offset; | 2233 | atomic_inc(&rdev->nr_pending); |
1748 | bio->bi_bdev = rdev->bdev; | 2234 | bio->bi_sector = sector_nr + rdev->data_offset; |
1749 | bio->bi_private = r1_bio; | 2235 | bio->bi_bdev = rdev->bdev; |
2236 | bio->bi_private = r1_bio; | ||
2237 | } | ||
1750 | } | 2238 | } |
1751 | rcu_read_unlock(); | 2239 | rcu_read_unlock(); |
1752 | if (disk < 0) | 2240 | if (disk < 0) |
1753 | disk = wonly; | 2241 | disk = wonly; |
1754 | r1_bio->read_disk = disk; | 2242 | r1_bio->read_disk = disk; |
1755 | 2243 | ||
2244 | if (read_targets == 0 && min_bad > 0) { | ||
2245 | /* These sectors are bad on all InSync devices, so we | ||
2246 | * need to mark them bad on all write targets | ||
2247 | */ | ||
2248 | int ok = 1; | ||
2249 | for (i = 0 ; i < conf->raid_disks ; i++) | ||
2250 | if (r1_bio->bios[i]->bi_end_io == end_sync_write) { | ||
2251 | mdk_rdev_t *rdev = | ||
2252 | rcu_dereference(conf->mirrors[i].rdev); | ||
2253 | ok = rdev_set_badblocks(rdev, sector_nr, | ||
2254 | min_bad, 0 | ||
2255 | ) && ok; | ||
2256 | } | ||
2257 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
2258 | *skipped = 1; | ||
2259 | put_buf(r1_bio); | ||
2260 | |||
2261 | if (!ok) { | ||
2262 | /* Cannot record the badblocks, so need to | ||
2263 | * abort the resync. | ||
2264 | * If there are multiple read targets, could just | ||
2265 | * fail the really bad ones ??? | ||
2266 | */ | ||
2267 | conf->recovery_disabled = mddev->recovery_disabled; | ||
2268 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
2269 | return 0; | ||
2270 | } else | ||
2271 | return min_bad; | ||
2272 | |||
2273 | } | ||
2274 | if (min_bad > 0 && min_bad < good_sectors) { | ||
2275 | /* only resync enough to reach the next bad->good | ||
2276 | * transition */ | ||
2277 | good_sectors = min_bad; | ||
2278 | } | ||
2279 | |||
1756 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) | 2280 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) |
1757 | /* extra read targets are also write targets */ | 2281 | /* extra read targets are also write targets */ |
1758 | write_targets += read_targets-1; | 2282 | write_targets += read_targets-1; |
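The hunk above clips the resync window around known bad blocks: a readable device whose bad range starts after sector_nr shrinks good_sectors, while one that is already bad at sector_nr contributes to min_bad. The compilable sketch below models just that arithmetic; the query struct stands in for is_badblock(), and the min() is equivalent because the kernel only queries within the current window.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    struct bb { int has_bad; sector_t first_bad; int bad_sectors; };

    /* Clip *good_sectors and accumulate *min_bad for one device, the
     * way the resync loop above does per rdev. */
    static void clip_resync(sector_t sector_nr, struct bb q,
                            int *good_sectors, int *min_bad)
    {
        if (!q.has_bad)
            return;
        if (q.first_bad > sector_nr) {
            /* bad range starts later: can only read up to it */
            if ((sector_t)*good_sectors > q.first_bad - sector_nr)
                *good_sectors = (int)(q.first_bad - sector_nr);
        } else {
            /* already inside the bad range on this device */
            int remaining = q.bad_sectors - (int)(sector_nr - q.first_bad);
            if (*min_bad == 0 || *min_bad > remaining)
                *min_bad = remaining;
        }
    }

    int main(void)
    {
        int good = 128, min_bad = 0;            /* resync window in sectors */
        struct bb dev0 = { 1, 100, 16 };        /* bad 16 sectors at 100 */
        struct bb dev1 = { 1, 60, 8 };          /* bad from 60, we are at 64 */
        clip_resync(64, dev0, &good, &min_bad); /* good becomes 36 */
        clip_resync(64, dev1, &good, &min_bad); /* min_bad becomes 4 */
        printf("good=%d min_bad=%d\n", good, min_bad);
        return 0;
    }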
@@ -1769,6 +2293,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1769 | 2293 | ||
1770 | if (max_sector > mddev->resync_max) | 2294 | if (max_sector > mddev->resync_max) |
1771 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ | 2295 | max_sector = mddev->resync_max; /* Don't do IO beyond here */ |
2296 | if (max_sector > sector_nr + good_sectors) | ||
2297 | max_sector = sector_nr + good_sectors; | ||
1772 | nr_sectors = 0; | 2298 | nr_sectors = 0; |
1773 | sync_blocks = 0; | 2299 | sync_blocks = 0; |
1774 | do { | 2300 | do { |
@@ -2045,8 +2571,7 @@ static int stop(mddev_t *mddev) | |||
2045 | raise_barrier(conf); | 2571 | raise_barrier(conf); |
2046 | lower_barrier(conf); | 2572 | lower_barrier(conf); |
2047 | 2573 | ||
2048 | md_unregister_thread(mddev->thread); | 2574 | md_unregister_thread(&mddev->thread); |
2049 | mddev->thread = NULL; | ||
2050 | if (conf->r1bio_pool) | 2575 | if (conf->r1bio_pool) |
2051 | mempool_destroy(conf->r1bio_pool); | 2576 | mempool_destroy(conf->r1bio_pool); |
2052 | kfree(conf->mirrors); | 2577 | kfree(conf->mirrors); |
@@ -2154,18 +2679,13 @@ static int raid1_reshape(mddev_t *mddev) | |||
2154 | for (d = d2 = 0; d < conf->raid_disks; d++) { | 2679 | for (d = d2 = 0; d < conf->raid_disks; d++) { |
2155 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; | 2680 | mdk_rdev_t *rdev = conf->mirrors[d].rdev; |
2156 | if (rdev && rdev->raid_disk != d2) { | 2681 | if (rdev && rdev->raid_disk != d2) { |
2157 | char nm[20]; | 2682 | sysfs_unlink_rdev(mddev, rdev); |
2158 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
2159 | sysfs_remove_link(&mddev->kobj, nm); | ||
2160 | rdev->raid_disk = d2; | 2683 | rdev->raid_disk = d2; |
2161 | sprintf(nm, "rd%d", rdev->raid_disk); | 2684 | sysfs_unlink_rdev(mddev, rdev); |
2162 | sysfs_remove_link(&mddev->kobj, nm); | 2685 | if (sysfs_link_rdev(mddev, rdev)) |
2163 | if (sysfs_create_link(&mddev->kobj, | ||
2164 | &rdev->kobj, nm)) | ||
2165 | printk(KERN_WARNING | 2686 | printk(KERN_WARNING |
2166 | "md/raid1:%s: cannot register " | 2687 | "md/raid1:%s: cannot register rd%d\n", |
2167 | "%s\n", | 2688 | mdname(mddev), rdev->raid_disk); |
2168 | mdname(mddev), nm); | ||
2169 | } | 2689 | } |
2170 | if (rdev) | 2690 | if (rdev) |
2171 | newmirrors[d2++].rdev = rdev; | 2691 | newmirrors[d2++].rdev = rdev; |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e743a64fac4..e0d676b4897 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -48,6 +48,12 @@ struct r1_private_data_s { | |||
48 | * (fresh device added). | 48 | * (fresh device added). |
49 | * Cleared when a sync completes. | 49 | * Cleared when a sync completes. |
50 | */ | 50 | */ |
51 | int recovery_disabled; /* when the same as | ||
52 | * mddev->recovery_disabled | ||
53 | * we don't allow recovery | ||
54 | * to be attempted as we | ||
55 | * expect a read error | ||
56 | */ | ||
51 | 57 | ||
52 | wait_queue_head_t wait_barrier; | 58 | wait_queue_head_t wait_barrier; |
53 | 59 | ||
@@ -95,7 +101,7 @@ struct r1bio_s { | |||
95 | 101 | ||
96 | struct list_head retry_list; | 102 | struct list_head retry_list; |
97 | /* Next two are only valid when R1BIO_BehindIO is set */ | 103 | /* Next two are only valid when R1BIO_BehindIO is set */ |
98 | struct page **behind_pages; | 104 | struct bio_vec *behind_bvecs; |
99 | int behind_page_count; | 105 | int behind_page_count; |
100 | /* | 106 | /* |
101 | * if the IO is in WRITE direction, then multiple bios are used. | 107 | * if the IO is in WRITE direction, then multiple bios are used. |
@@ -110,13 +116,24 @@ struct r1bio_s { | |||
110 | * correct the read error. To keep track of bad blocks on a per-bio | 116 | * correct the read error. To keep track of bad blocks on a per-bio |
111 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 117 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
112 | */ | 118 | */ |
113 | #define IO_BLOCKED ((struct bio*)1) | 119 | #define IO_BLOCKED ((struct bio *)1) |
120 | /* When we successfully write to a known bad-block, we need to remove the | ||
121 | * bad-block marking which must be done from process context. So we record | ||
122 | * the success by setting bios[n] to IO_MADE_GOOD | ||
123 | */ | ||
124 | #define IO_MADE_GOOD ((struct bio *)2) | ||
125 | |||
126 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
114 | 127 | ||
115 | /* bits for r1bio.state */ | 128 | /* bits for r1bio.state */ |
116 | #define R1BIO_Uptodate 0 | 129 | #define R1BIO_Uptodate 0 |
117 | #define R1BIO_IsSync 1 | 130 | #define R1BIO_IsSync 1 |
118 | #define R1BIO_Degraded 2 | 131 | #define R1BIO_Degraded 2 |
119 | #define R1BIO_BehindIO 3 | 132 | #define R1BIO_BehindIO 3 |
133 | /* Set ReadError on bios that experience a read error so that | ||
134 | * raid1d knows what to do with them. | ||
135 | */ | ||
136 | #define R1BIO_ReadError 4 | ||
120 | /* For write-behind requests, we call bi_end_io when | 137 | /* For write-behind requests, we call bi_end_io when |
121 | * the last non-write-behind device completes, providing | 138 | * the last non-write-behind device completes, providing |
122 | * any write was successful. Otherwise we call when | 139 | * any write was successful. Otherwise we call when |
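IO_BLOCKED and IO_MADE_GOOD are sentinel pointer values, not real bios, and BIO_SPECIAL() is how the cleanup paths tell them (and NULL) apart from a pointer that must be bio_put(). A small stand-alone model of the idiom follows; the struct name is made up for the example.

    #include <stdio.h>

    struct fake_bio { int id; };

    /* Sentinels: small integers cast to pointers, never dereferenced. */
    #define IO_BLOCKED   ((struct fake_bio *)1)
    #define IO_MADE_GOOD ((struct fake_bio *)2)
    #define BIO_SPECIAL(b) ((unsigned long)(b) <= 2)

    static void put_slot(struct fake_bio **slot)
    {
        if (!BIO_SPECIAL(*slot))
            printf("bio_put(%d)\n", (*slot)->id);  /* only real bios get put */
        *slot = NULL;
    }

    int main(void)
    {
        struct fake_bio real = { 42 };
        struct fake_bio *slots[3] = { &real, IO_BLOCKED, IO_MADE_GOOD };
        for (int i = 0; i < 3; i++)
            put_slot(&slots[i]);                   /* prints only bio_put(42) */
        return 0;
    }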
@@ -125,6 +142,11 @@ struct r1bio_s { | |||
125 | * Record that bi_end_io was called with this flag... | 142 | * Record that bi_end_io was called with this flag... |
126 | */ | 143 | */ |
127 | #define R1BIO_Returned 6 | 144 | #define R1BIO_Returned 6 |
145 | /* If a write for this request means we can clear some | ||
146 | * known-bad-block records, we set this flag | ||
147 | */ | ||
148 | #define R1BIO_MadeGood 7 | ||
149 | #define R1BIO_WriteError 8 | ||
128 | 150 | ||
129 | extern int md_raid1_congested(mddev_t *mddev, int bits); | 151 | extern int md_raid1_congested(mddev_t *mddev, int bits); |
130 | 152 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e846688962..1d44228530a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
25 | #include <linux/ratelimit.h> | ||
25 | #include "md.h" | 26 | #include "md.h" |
26 | #include "raid10.h" | 27 | #include "raid10.h" |
27 | #include "raid0.h" | 28 | #include "raid0.h" |
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
123 | for (j = 0 ; j < nalloc; j++) { | 124 | for (j = 0 ; j < nalloc; j++) { |
124 | bio = r10_bio->devs[j].bio; | 125 | bio = r10_bio->devs[j].bio; |
125 | for (i = 0; i < RESYNC_PAGES; i++) { | 126 | for (i = 0; i < RESYNC_PAGES; i++) { |
126 | page = alloc_page(gfp_flags); | 127 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
128 | &conf->mddev->recovery)) { | ||
129 | /* we can share bv_page's during recovery */ | ||
130 | struct bio *rbio = r10_bio->devs[0].bio; | ||
131 | page = rbio->bi_io_vec[i].bv_page; | ||
132 | get_page(page); | ||
133 | } else | ||
134 | page = alloc_page(gfp_flags); | ||
127 | if (unlikely(!page)) | 135 | if (unlikely(!page)) |
128 | goto out_free_pages; | 136 | goto out_free_pages; |
129 | 137 | ||
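During recovery (as opposed to a full resync) the second bio in each r10_bio can reuse the first bio's pages, taking a reference with get_page() instead of allocating a fresh page. A toy refcount model of that sharing is sketched below; everything here is illustrative user-space code, not the kernel's page allocator.

    #include <stdio.h>
    #include <stdlib.h>

    struct page { int refcount; };

    static struct page *alloc_page(void)
    {
        struct page *p = malloc(sizeof(*p));
        p->refcount = 1;
        return p;
    }
    static void get_page(struct page *p) { p->refcount++; }
    static void put_page(struct page *p)
    {
        if (--p->refcount == 0)
            free(p);
    }

    int main(void)
    {
        int recovery = 1;                 /* 1: recovery, 0: resync */
        struct page *src = alloc_page();  /* devs[0] buffer page */
        struct page *dst;

        if (recovery) {
            dst = src;                    /* share: read once, write elsewhere */
            get_page(dst);
        } else {
            dst = alloc_page();           /* resync compares two copies */
        }
        printf("shared=%d refcount=%d\n", src == dst, src->refcount);

        put_page(dst);
        put_page(src);
        return 0;
    }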
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
173 | 181 | ||
174 | for (i = 0; i < conf->copies; i++) { | 182 | for (i = 0; i < conf->copies; i++) { |
175 | struct bio **bio = & r10_bio->devs[i].bio; | 183 | struct bio **bio = & r10_bio->devs[i].bio; |
176 | if (*bio && *bio != IO_BLOCKED) | 184 | if (!BIO_SPECIAL(*bio)) |
177 | bio_put(*bio); | 185 | bio_put(*bio); |
178 | *bio = NULL; | 186 | *bio = NULL; |
179 | } | 187 | } |
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
183 | { | 191 | { |
184 | conf_t *conf = r10_bio->mddev->private; | 192 | conf_t *conf = r10_bio->mddev->private; |
185 | 193 | ||
186 | /* | ||
187 | * Wake up any possible resync thread that waits for the device | ||
188 | * to go idle. | ||
189 | */ | ||
190 | allow_barrier(conf); | ||
191 | |||
192 | put_all_bios(conf, r10_bio); | 194 | put_all_bios(conf, r10_bio); |
193 | mempool_free(r10_bio, conf->r10bio_pool); | 195 | mempool_free(r10_bio, conf->r10bio_pool); |
194 | } | 196 | } |
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
227 | static void raid_end_bio_io(r10bio_t *r10_bio) | 229 | static void raid_end_bio_io(r10bio_t *r10_bio) |
228 | { | 230 | { |
229 | struct bio *bio = r10_bio->master_bio; | 231 | struct bio *bio = r10_bio->master_bio; |
232 | int done; | ||
233 | conf_t *conf = r10_bio->mddev->private; | ||
230 | 234 | ||
231 | bio_endio(bio, | 235 | if (bio->bi_phys_segments) { |
232 | test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); | 236 | unsigned long flags; |
237 | spin_lock_irqsave(&conf->device_lock, flags); | ||
238 | bio->bi_phys_segments--; | ||
239 | done = (bio->bi_phys_segments == 0); | ||
240 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
241 | } else | ||
242 | done = 1; | ||
243 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) | ||
244 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
245 | if (done) { | ||
246 | bio_endio(bio, 0); | ||
247 | /* | ||
248 | * Wake up any possible resync thread that waits for the device | ||
249 | * to go idle. | ||
250 | */ | ||
251 | allow_barrier(conf); | ||
252 | } | ||
233 | free_r10bio(r10_bio); | 253 | free_r10bio(r10_bio); |
234 | } | 254 | } |
235 | 255 | ||
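raid_end_bio_io() now completes the master bio only when every split piece has finished; bio->bi_phys_segments doubles as that counter, decremented under device_lock. The user-space sketch below shows the counting rule, with a pthread mutex standing in for the spinlock.

    #include <pthread.h>
    #include <stdio.h>

    struct master {
        pthread_mutex_t lock;
        int phys_segments;   /* 0: single piece; N>0: N pieces outstanding */
    };

    /* Returns 1 when the caller should signal end-of-IO to the upper layer. */
    static int piece_done(struct master *m)
    {
        int done;

        pthread_mutex_lock(&m->lock);
        if (m->phys_segments) {
            m->phys_segments--;
            done = (m->phys_segments == 0);
        } else {
            done = 1;        /* request was never split */
        }
        pthread_mutex_unlock(&m->lock);
        return done;
    }

    int main(void)
    {
        struct master m = { PTHREAD_MUTEX_INITIALIZER, 2 };  /* split in two */
        printf("%d %d\n", piece_done(&m), piece_done(&m));   /* 0 1 */
        return 0;
    }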
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio) | |||
244 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 264 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
245 | } | 265 | } |
246 | 266 | ||
267 | /* | ||
268 | * Find the disk number which triggered the given bio | ||
269 | */ | ||
270 | static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, | ||
271 | struct bio *bio, int *slotp) | ||
272 | { | ||
273 | int slot; | ||
274 | |||
275 | for (slot = 0; slot < conf->copies; slot++) | ||
276 | if (r10_bio->devs[slot].bio == bio) | ||
277 | break; | ||
278 | |||
279 | BUG_ON(slot == conf->copies); | ||
280 | update_head_pos(slot, r10_bio); | ||
281 | |||
282 | if (slotp) | ||
283 | *slotp = slot; | ||
284 | return r10_bio->devs[slot].devnum; | ||
285 | } | ||
286 | |||
247 | static void raid10_end_read_request(struct bio *bio, int error) | 287 | static void raid10_end_read_request(struct bio *bio, int error) |
248 | { | 288 | { |
249 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 289 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -277,34 +317,60 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
277 | * oops, read error - keep the refcount on the rdev | 317 | * oops, read error - keep the refcount on the rdev |
278 | */ | 318 | */ |
279 | char b[BDEVNAME_SIZE]; | 319 | char b[BDEVNAME_SIZE]; |
280 | if (printk_ratelimit()) | 320 | printk_ratelimited(KERN_ERR |
281 | printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", | 321 | "md/raid10:%s: %s: rescheduling sector %llu\n", |
282 | mdname(conf->mddev), | 322 | mdname(conf->mddev), |
283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 323 | bdevname(conf->mirrors[dev].rdev->bdev, b), |
324 | (unsigned long long)r10_bio->sector); | ||
325 | set_bit(R10BIO_ReadError, &r10_bio->state); | ||
284 | reschedule_retry(r10_bio); | 326 | reschedule_retry(r10_bio); |
285 | } | 327 | } |
286 | } | 328 | } |
287 | 329 | ||
330 | static void close_write(r10bio_t *r10_bio) | ||
331 | { | ||
332 | /* clear the bitmap if all writes complete successfully */ | ||
333 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
334 | r10_bio->sectors, | ||
335 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
336 | 0); | ||
337 | md_write_end(r10_bio->mddev); | ||
338 | } | ||
339 | |||
340 | static void one_write_done(r10bio_t *r10_bio) | ||
341 | { | ||
342 | if (atomic_dec_and_test(&r10_bio->remaining)) { | ||
343 | if (test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
344 | reschedule_retry(r10_bio); | ||
345 | else { | ||
346 | close_write(r10_bio); | ||
347 | if (test_bit(R10BIO_MadeGood, &r10_bio->state)) | ||
348 | reschedule_retry(r10_bio); | ||
349 | else | ||
350 | raid_end_bio_io(r10_bio); | ||
351 | } | ||
352 | } | ||
353 | } | ||
354 | |||
288 | static void raid10_end_write_request(struct bio *bio, int error) | 355 | static void raid10_end_write_request(struct bio *bio, int error) |
289 | { | 356 | { |
290 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 357 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
291 | r10bio_t *r10_bio = bio->bi_private; | 358 | r10bio_t *r10_bio = bio->bi_private; |
292 | int slot, dev; | 359 | int dev; |
360 | int dec_rdev = 1; | ||
293 | conf_t *conf = r10_bio->mddev->private; | 361 | conf_t *conf = r10_bio->mddev->private; |
362 | int slot; | ||
294 | 363 | ||
295 | for (slot = 0; slot < conf->copies; slot++) | 364 | dev = find_bio_disk(conf, r10_bio, bio, &slot); |
296 | if (r10_bio->devs[slot].bio == bio) | ||
297 | break; | ||
298 | dev = r10_bio->devs[slot].devnum; | ||
299 | 365 | ||
300 | /* | 366 | /* |
301 | * this branch is our 'one mirror IO has finished' event handler: | 367 | * this branch is our 'one mirror IO has finished' event handler: |
302 | */ | 368 | */ |
303 | if (!uptodate) { | 369 | if (!uptodate) { |
304 | md_error(r10_bio->mddev, conf->mirrors[dev].rdev); | 370 | set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags); |
305 | /* an I/O failed, we can't clear the bitmap */ | 371 | set_bit(R10BIO_WriteError, &r10_bio->state); |
306 | set_bit(R10BIO_Degraded, &r10_bio->state); | 372 | dec_rdev = 0; |
307 | } else | 373 | } else { |
308 | /* | 374 | /* |
309 | * Set R10BIO_Uptodate in our master bio, so that | 375 | * Set R10BIO_Uptodate in our master bio, so that |
310 | * we will return a good error code for to the higher | 376 | * we will return a good error code for to the higher |
@@ -314,26 +380,31 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
314 | * user-side. So if something waits for IO, then it will | 380 | * user-side. So if something waits for IO, then it will |
315 | * wait for the 'master' bio. | 381 | * wait for the 'master' bio. |
316 | */ | 382 | */ |
383 | sector_t first_bad; | ||
384 | int bad_sectors; | ||
385 | |||
317 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 386 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
318 | 387 | ||
319 | update_head_pos(slot, r10_bio); | 388 | /* Maybe we can clear some bad blocks. */ |
389 | if (is_badblock(conf->mirrors[dev].rdev, | ||
390 | r10_bio->devs[slot].addr, | ||
391 | r10_bio->sectors, | ||
392 | &first_bad, &bad_sectors)) { | ||
393 | bio_put(bio); | ||
394 | r10_bio->devs[slot].bio = IO_MADE_GOOD; | ||
395 | dec_rdev = 0; | ||
396 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
397 | } | ||
398 | } | ||
320 | 399 | ||
321 | /* | 400 | /* |
322 | * | 401 | * |
323 | * Let's see if all mirrored write operations have finished | 402 | * Let's see if all mirrored write operations have finished |
324 | * already. | 403 | * already. |
325 | */ | 404 | */ |
326 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 405 | one_write_done(r10_bio); |
327 | /* clear the bitmap if all writes complete successfully */ | 406 | if (dec_rdev) |
328 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 407 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); |
329 | r10_bio->sectors, | ||
330 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
331 | 0); | ||
332 | md_write_end(r10_bio->mddev); | ||
333 | raid_end_bio_io(r10_bio); | ||
334 | } | ||
335 | |||
336 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
337 | } | 408 | } |
338 | 409 | ||
339 | 410 | ||
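one_write_done() only acts on the last completion: a write error defers everything to raid10d, otherwise the bitmap is closed out and the bio either ends or goes back to raid10d so stale bad-block records can be cleared. A compact model of that decision is shown below; the flag names are illustrative.

    #include <stdio.h>

    enum { F_WRITE_ERROR = 1, F_MADE_GOOD = 2 };

    /* What happens when the final mirror write for an r10_bio completes. */
    static const char *last_write_action(unsigned state)
    {
        if (state & F_WRITE_ERROR)
            return "reschedule_retry";                 /* record bad blocks later */
        /* close_write(): bitmap_endwrite + md_write_end happen here */
        if (state & F_MADE_GOOD)
            return "close_write + reschedule_retry";   /* clear old bad blocks */
        return "close_write + raid_end_bio_io";
    }

    int main(void)
    {
        printf("%s\n", last_write_action(0));
        printf("%s\n", last_write_action(F_MADE_GOOD));
        printf("%s\n", last_write_action(F_WRITE_ERROR | F_MADE_GOOD));
        return 0;
    }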
@@ -484,11 +555,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
484 | * FIXME: possibly should rethink readbalancing and do it differently | 555 | * FIXME: possibly should rethink readbalancing and do it differently |
485 | * depending on near_copies / far_copies geometry. | 556 | * depending on near_copies / far_copies geometry. |
486 | */ | 557 | */ |
487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 558 | static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors) |
488 | { | 559 | { |
489 | const sector_t this_sector = r10_bio->sector; | 560 | const sector_t this_sector = r10_bio->sector; |
490 | int disk, slot; | 561 | int disk, slot; |
491 | const int sectors = r10_bio->sectors; | 562 | int sectors = r10_bio->sectors; |
563 | int best_good_sectors; | ||
492 | sector_t new_distance, best_dist; | 564 | sector_t new_distance, best_dist; |
493 | mdk_rdev_t *rdev; | 565 | mdk_rdev_t *rdev; |
494 | int do_balance; | 566 | int do_balance; |
@@ -497,8 +569,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
497 | raid10_find_phys(conf, r10_bio); | 569 | raid10_find_phys(conf, r10_bio); |
498 | rcu_read_lock(); | 570 | rcu_read_lock(); |
499 | retry: | 571 | retry: |
572 | sectors = r10_bio->sectors; | ||
500 | best_slot = -1; | 573 | best_slot = -1; |
501 | best_dist = MaxSector; | 574 | best_dist = MaxSector; |
575 | best_good_sectors = 0; | ||
502 | do_balance = 1; | 576 | do_balance = 1; |
503 | /* | 577 | /* |
504 | * Check if we can balance. We can balance on the whole | 578 | * Check if we can balance. We can balance on the whole |
@@ -511,6 +585,10 @@ retry: | |||
511 | do_balance = 0; | 585 | do_balance = 0; |
512 | 586 | ||
513 | for (slot = 0; slot < conf->copies ; slot++) { | 587 | for (slot = 0; slot < conf->copies ; slot++) { |
588 | sector_t first_bad; | ||
589 | int bad_sectors; | ||
590 | sector_t dev_sector; | ||
591 | |||
514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) | 592 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
515 | continue; | 593 | continue; |
516 | disk = r10_bio->devs[slot].devnum; | 594 | disk = r10_bio->devs[slot].devnum; |
@@ -520,6 +598,37 @@ retry: | |||
520 | if (!test_bit(In_sync, &rdev->flags)) | 598 | if (!test_bit(In_sync, &rdev->flags)) |
521 | continue; | 599 | continue; |
522 | 600 | ||
601 | dev_sector = r10_bio->devs[slot].addr; | ||
602 | if (is_badblock(rdev, dev_sector, sectors, | ||
603 | &first_bad, &bad_sectors)) { | ||
604 | if (best_dist < MaxSector) | ||
605 | /* Already have a better slot */ | ||
606 | continue; | ||
607 | if (first_bad <= dev_sector) { | ||
608 | /* Cannot read here. If this is the | ||
609 | * 'primary' device, then we must not read | ||
610 | * beyond 'bad_sectors' from another device. | ||
611 | */ | ||
612 | bad_sectors -= (dev_sector - first_bad); | ||
613 | if (!do_balance && sectors > bad_sectors) | ||
614 | sectors = bad_sectors; | ||
615 | if (best_good_sectors > sectors) | ||
616 | best_good_sectors = sectors; | ||
617 | } else { | ||
618 | sector_t good_sectors = | ||
619 | first_bad - dev_sector; | ||
620 | if (good_sectors > best_good_sectors) { | ||
621 | best_good_sectors = good_sectors; | ||
622 | best_slot = slot; | ||
623 | } | ||
624 | if (!do_balance) | ||
625 | /* Must read from here */ | ||
626 | break; | ||
627 | } | ||
628 | continue; | ||
629 | } else | ||
630 | best_good_sectors = sectors; | ||
631 | |||
523 | if (!do_balance) | 632 | if (!do_balance) |
524 | break; | 633 | break; |
525 | 634 | ||
@@ -561,6 +670,7 @@ retry: | |||
561 | } else | 670 | } else |
562 | disk = -1; | 671 | disk = -1; |
563 | rcu_read_unlock(); | 672 | rcu_read_unlock(); |
673 | *max_sectors = best_good_sectors; | ||
564 | 674 | ||
565 | return disk; | 675 | return disk; |
566 | } | 676 | } |
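read_balance() now also reports, via *max_sectors, how far the chosen device can actually be read: a slot whose bad range begins inside the request can only serve the sectors before first_bad, and a slot that is bad at the start cannot serve the read at all. The small sketch below captures that per-slot calculation; the struct layout is invented for the example.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    struct badrange { int is_bad; sector_t first_bad; int bad_sectors; };

    /* How many of 'sectors' starting at dev_sector this slot can serve.
     * 0 means "cannot start the read here at all". */
    static int readable_sectors(sector_t dev_sector, int sectors,
                                struct badrange q)
    {
        if (!q.is_bad)
            return sectors;
        if (q.first_bad <= dev_sector)
            return 0;                               /* bad right at the start */
        if (q.first_bad - dev_sector < (sector_t)sectors)
            return (int)(q.first_bad - dev_sector); /* stop before the bad run */
        return sectors;
    }

    int main(void)
    {
        struct badrange clean   = { 0, 0, 0 };
        struct badrange later   = { 1, 1000 + 24, 8 };
        struct badrange atstart = { 1, 990, 64 };

        printf("%d\n", readable_sectors(1000, 64, clean));   /* 64 */
        printf("%d\n", readable_sectors(1000, 64, later));   /* 24 */
        printf("%d\n", readable_sectors(1000, 64, atstart)); /* 0  */
        return 0;
    }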
@@ -734,6 +844,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
734 | unsigned long flags; | 844 | unsigned long flags; |
735 | mdk_rdev_t *blocked_rdev; | 845 | mdk_rdev_t *blocked_rdev; |
736 | int plugged; | 846 | int plugged; |
847 | int sectors_handled; | ||
848 | int max_sectors; | ||
737 | 849 | ||
738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { | 850 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
739 | md_flush_request(mddev, bio); | 851 | md_flush_request(mddev, bio); |
@@ -808,12 +920,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
808 | r10_bio->sector = bio->bi_sector; | 920 | r10_bio->sector = bio->bi_sector; |
809 | r10_bio->state = 0; | 921 | r10_bio->state = 0; |
810 | 922 | ||
923 | /* We might need to issue multiple reads to different | ||
924 | * devices if there are bad blocks around, so we keep | ||
925 | * track of the number of reads in bio->bi_phys_segments. | ||
926 | * If this is 0, there is only one r10_bio and no locking | ||
927 | * will be needed when the request completes. If it is | ||
928 | * non-zero, then it is the number of not-completed requests. | ||
929 | */ | ||
930 | bio->bi_phys_segments = 0; | ||
931 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
932 | |||
811 | if (rw == READ) { | 933 | if (rw == READ) { |
812 | /* | 934 | /* |
813 | * read balancing logic: | 935 | * read balancing logic: |
814 | */ | 936 | */ |
815 | int disk = read_balance(conf, r10_bio); | 937 | int disk; |
816 | int slot = r10_bio->read_slot; | 938 | int slot; |
939 | |||
940 | read_again: | ||
941 | disk = read_balance(conf, r10_bio, &max_sectors); | ||
942 | slot = r10_bio->read_slot; | ||
817 | if (disk < 0) { | 943 | if (disk < 0) { |
818 | raid_end_bio_io(r10_bio); | 944 | raid_end_bio_io(r10_bio); |
819 | return 0; | 945 | return 0; |
@@ -821,6 +947,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | mirror = conf->mirrors + disk; | 947 | mirror = conf->mirrors + disk; |
822 | 948 | ||
823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 949 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
950 | md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, | ||
951 | max_sectors); | ||
824 | 952 | ||
825 | r10_bio->devs[slot].bio = read_bio; | 953 | r10_bio->devs[slot].bio = read_bio; |
826 | 954 | ||
@@ -831,7 +959,37 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
831 | read_bio->bi_rw = READ | do_sync; | 959 | read_bio->bi_rw = READ | do_sync; |
832 | read_bio->bi_private = r10_bio; | 960 | read_bio->bi_private = r10_bio; |
833 | 961 | ||
834 | generic_make_request(read_bio); | 962 | if (max_sectors < r10_bio->sectors) { |
963 | /* Could not read all from this device, so we will | ||
964 | * need another r10_bio. | ||
965 | */ | ||
966 | sectors_handled = (r10_bio->sectors + max_sectors | ||
967 | - bio->bi_sector); | ||
968 | r10_bio->sectors = max_sectors; | ||
969 | spin_lock_irq(&conf->device_lock); | ||
970 | if (bio->bi_phys_segments == 0) | ||
971 | bio->bi_phys_segments = 2; | ||
972 | else | ||
973 | bio->bi_phys_segments++; | ||
974 | spin_unlock(&conf->device_lock); | ||
975 | /* Cannot call generic_make_request directly | ||
976 | * as that will be queued in __generic_make_request | ||
977 | * and subsequent mempool_alloc might block | ||
978 | * waiting for it. so hand bio over to raid10d. | ||
979 | */ | ||
980 | reschedule_retry(r10_bio); | ||
981 | |||
982 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
983 | |||
984 | r10_bio->master_bio = bio; | ||
985 | r10_bio->sectors = ((bio->bi_size >> 9) | ||
986 | - sectors_handled); | ||
987 | r10_bio->state = 0; | ||
988 | r10_bio->mddev = mddev; | ||
989 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
990 | goto read_again; | ||
991 | } else | ||
992 | generic_make_request(read_bio); | ||
835 | return 0; | 993 | return 0; |
836 | } | 994 | } |
837 | 995 | ||
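When read_balance() can only satisfy part of the request, make_request() trims a clone, bumps bi_phys_segments, hands the remainder to a fresh r10_bio and loops back to read_again. The stand-alone sketch below shows the shape of that split; the per-chunk limit and the "issue" step are faked for illustration.

    #include <stdio.h>

    /* Pretend a bad block sits at sector 96 on the preferred mirror,
     * and every later mirror can cover 64 sectors at a time. */
    static int readable_from_best_device(int offset)
    {
        return offset < 96 ? 96 - offset : 64;
    }

    static void split_read(int total_sectors)
    {
        int handled = 0;
        int segments = 0;   /* models bio->bi_phys_segments */

        while (handled < total_sectors) {
            int chunk = readable_from_best_device(handled);
            if (chunk > total_sectors - handled)
                chunk = total_sectors - handled;
            segments++;
            printf("issue read: offset=%d len=%d\n", handled, chunk);
            handled += chunk;
        }
        printf("pieces=%d (bi_phys_segments would be %d)\n",
               segments, segments > 1 ? segments : 0);
    }

    int main(void)
    {
        split_read(256);    /* issues 96 + 64 + 64 + 32 sector reads */
        return 0;
    }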
@@ -841,13 +999,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
841 | /* first select target devices under rcu_lock and | 999 | /* first select target devices under rcu_lock and |
842 | * inc refcount on their rdev. Record them by setting | 1000 | * inc refcount on their rdev. Record them by setting |
843 | * bios[x] to bio | 1001 | * bios[x] to bio |
1002 | * If there are known/acknowledged bad blocks on any device | ||
1003 | * on which we have seen a write error, we want to avoid | ||
1004 | * writing to those blocks. This potentially requires several | ||
1005 | * writes to write around the bad blocks. Each set of writes | ||
1006 | * gets its own r10_bio with a set of bios attached. The number | ||
1007 | * of r10_bios is recorded in bio->bi_phys_segments just as with | ||
1008 | * the read case. | ||
844 | */ | 1009 | */ |
845 | plugged = mddev_check_plugged(mddev); | 1010 | plugged = mddev_check_plugged(mddev); |
846 | 1011 | ||
847 | raid10_find_phys(conf, r10_bio); | 1012 | raid10_find_phys(conf, r10_bio); |
848 | retry_write: | 1013 | retry_write: |
849 | blocked_rdev = NULL; | 1014 | blocked_rdev = NULL; |
850 | rcu_read_lock(); | 1015 | rcu_read_lock(); |
1016 | max_sectors = r10_bio->sectors; | ||
1017 | |||
851 | for (i = 0; i < conf->copies; i++) { | 1018 | for (i = 0; i < conf->copies; i++) { |
852 | int d = r10_bio->devs[i].devnum; | 1019 | int d = r10_bio->devs[i].devnum; |
853 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); | 1020 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); |
@@ -856,13 +1023,55 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
856 | blocked_rdev = rdev; | 1023 | blocked_rdev = rdev; |
857 | break; | 1024 | break; |
858 | } | 1025 | } |
859 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 1026 | r10_bio->devs[i].bio = NULL; |
860 | atomic_inc(&rdev->nr_pending); | 1027 | if (!rdev || test_bit(Faulty, &rdev->flags)) { |
861 | r10_bio->devs[i].bio = bio; | ||
862 | } else { | ||
863 | r10_bio->devs[i].bio = NULL; | ||
864 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1028 | set_bit(R10BIO_Degraded, &r10_bio->state); |
1029 | continue; | ||
1030 | } | ||
1031 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
1032 | sector_t first_bad; | ||
1033 | sector_t dev_sector = r10_bio->devs[i].addr; | ||
1034 | int bad_sectors; | ||
1035 | int is_bad; | ||
1036 | |||
1037 | is_bad = is_badblock(rdev, dev_sector, | ||
1038 | max_sectors, | ||
1039 | &first_bad, &bad_sectors); | ||
1040 | if (is_bad < 0) { | ||
1041 | /* Mustn't write here until the bad block | ||
1042 | * is acknowledged | ||
1043 | */ | ||
1044 | atomic_inc(&rdev->nr_pending); | ||
1045 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
1046 | blocked_rdev = rdev; | ||
1047 | break; | ||
1048 | } | ||
1049 | if (is_bad && first_bad <= dev_sector) { | ||
1050 | /* Cannot write here at all */ | ||
1051 | bad_sectors -= (dev_sector - first_bad); | ||
1052 | if (bad_sectors < max_sectors) | ||
1053 | /* Mustn't write more than bad_sectors | ||
1054 | * to other devices yet | ||
1055 | */ | ||
1056 | max_sectors = bad_sectors; | ||
1057 | /* We don't set R10BIO_Degraded as that | ||
1058 | * only applies if the disk is missing, | ||
1059 | * so it might be re-added, and we want to | ||
1060 | * know to recover this chunk. | ||
1061 | * In this case the device is here, and the | ||
1062 | * fact that this chunk is not in-sync is | ||
1063 | * recorded in the bad block log. | ||
1064 | */ | ||
1065 | continue; | ||
1066 | } | ||
1067 | if (is_bad) { | ||
1068 | int good_sectors = first_bad - dev_sector; | ||
1069 | if (good_sectors < max_sectors) | ||
1070 | max_sectors = good_sectors; | ||
1071 | } | ||
865 | } | 1072 | } |
1073 | r10_bio->devs[i].bio = bio; | ||
1074 | atomic_inc(&rdev->nr_pending); | ||
866 | } | 1075 | } |
867 | rcu_read_unlock(); | 1076 | rcu_read_unlock(); |
868 | 1077 | ||
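For writes, each device is now checked against its bad-block list with three outcomes: an unacknowledged bad block blocks the whole request, a bad block covering the start means this device is skipped (and the other writes must not go past the end of that bad run), and a bad block further in just clamps max_sectors. A sketch of that three-way decision follows; the return values and struct are invented for the example, with q.rv mimicking is_badblock()'s -1/0/1.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    enum verdict { WRITE_OK, SKIP_DEVICE, WAIT_FOR_ACK };

    struct badquery { int rv; sector_t first_bad; int bad_sectors; };

    static enum verdict check_write(sector_t dev_sector, int *max_sectors,
                                    struct badquery q)
    {
        if (q.rv < 0)
            return WAIT_FOR_ACK;                 /* unacknowledged bad block */
        if (q.rv == 0)
            return WRITE_OK;                     /* no bad blocks in range */
        if (q.first_bad <= dev_sector) {
            /* cannot write here; others must stop where this bad run ends */
            int bad = q.bad_sectors - (int)(dev_sector - q.first_bad);
            if (bad < *max_sectors)
                *max_sectors = bad;
            return SKIP_DEVICE;
        }
        /* bad block later in the range: clamp everyone to the good prefix */
        if ((int)(q.first_bad - dev_sector) < *max_sectors)
            *max_sectors = (int)(q.first_bad - dev_sector);
        return WRITE_OK;
    }

    int main(void)
    {
        int max = 128;
        struct badquery later = { 1, 2048 + 40, 16 };
        printf("%d max=%d\n", check_write(2048, &max, later), max);  /* 0 max=40 */
        struct badquery cover = { 1, 2040, 32 };
        printf("%d max=%d\n", check_write(2048, &max, cover), max);  /* 1 max=24 */
        return 0;
    }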
@@ -882,8 +1091,22 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
882 | goto retry_write; | 1091 | goto retry_write; |
883 | } | 1092 | } |
884 | 1093 | ||
1094 | if (max_sectors < r10_bio->sectors) { | ||
1095 | /* We are splitting this into multiple parts, so | ||
1096 | * we need to prepare for allocating another r10_bio. | ||
1097 | */ | ||
1098 | r10_bio->sectors = max_sectors; | ||
1099 | spin_lock_irq(&conf->device_lock); | ||
1100 | if (bio->bi_phys_segments == 0) | ||
1101 | bio->bi_phys_segments = 2; | ||
1102 | else | ||
1103 | bio->bi_phys_segments++; | ||
1104 | spin_unlock_irq(&conf->device_lock); | ||
1105 | } | ||
1106 | sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; | ||
1107 | |||
885 | atomic_set(&r10_bio->remaining, 1); | 1108 | atomic_set(&r10_bio->remaining, 1); |
886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | 1109 | bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); |
887 | 1110 | ||
888 | for (i = 0; i < conf->copies; i++) { | 1111 | for (i = 0; i < conf->copies; i++) { |
889 | struct bio *mbio; | 1112 | struct bio *mbio; |
@@ -892,10 +1115,12 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
892 | continue; | 1115 | continue; |
893 | 1116 | ||
894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | 1117 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
1118 | md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, | ||
1119 | max_sectors); | ||
895 | r10_bio->devs[i].bio = mbio; | 1120 | r10_bio->devs[i].bio = mbio; |
896 | 1121 | ||
897 | mbio->bi_sector = r10_bio->devs[i].addr+ | 1122 | mbio->bi_sector = (r10_bio->devs[i].addr+ |
898 | conf->mirrors[d].rdev->data_offset; | 1123 | conf->mirrors[d].rdev->data_offset); |
899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1124 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
900 | mbio->bi_end_io = raid10_end_write_request; | 1125 | mbio->bi_end_io = raid10_end_write_request; |
901 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1126 | mbio->bi_rw = WRITE | do_sync | do_fua; |
@@ -907,15 +1132,26 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
907 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1132 | spin_unlock_irqrestore(&conf->device_lock, flags); |
908 | } | 1133 | } |
909 | 1134 | ||
910 | if (atomic_dec_and_test(&r10_bio->remaining)) { | 1135 | /* Don't remove the bias on 'remaining' (one_write_done) until |
911 | /* This matches the end of raid10_end_write_request() */ | 1136 | * after checking if we need to go around again. |
912 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | 1137 | */ |
913 | r10_bio->sectors, | 1138 | |
914 | !test_bit(R10BIO_Degraded, &r10_bio->state), | 1139 | if (sectors_handled < (bio->bi_size >> 9)) { |
915 | 0); | 1140 | one_write_done(r10_bio); |
916 | md_write_end(mddev); | 1141 | /* We need another r10_bio. It has already been counted |
917 | raid_end_bio_io(r10_bio); | 1142 | * in bio->bi_phys_segments. |
1143 | */ | ||
1144 | r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); | ||
1145 | |||
1146 | r10_bio->master_bio = bio; | ||
1147 | r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; | ||
1148 | |||
1149 | r10_bio->mddev = mddev; | ||
1150 | r10_bio->sector = bio->bi_sector + sectors_handled; | ||
1151 | r10_bio->state = 0; | ||
1152 | goto retry_write; | ||
918 | } | 1153 | } |
1154 | one_write_done(r10_bio); | ||
919 | 1155 | ||
920 | /* In case raid10d snuck in to freeze_array */ | 1156 | /* In case raid10d snuck in to freeze_array */ |
921 | wake_up(&conf->wait_barrier); | 1157 | wake_up(&conf->wait_barrier); |
@@ -949,6 +1185,30 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
949 | seq_printf(seq, "]"); | 1185 | seq_printf(seq, "]"); |
950 | } | 1186 | } |
951 | 1187 | ||
1188 | /* check if there are enough drives for | ||
1189 | * every block to appear on at least one. | ||
1190 | * Don't consider the device numbered 'ignore' | ||
1191 | * as we might be about to remove it. | ||
1192 | */ | ||
1193 | static int enough(conf_t *conf, int ignore) | ||
1194 | { | ||
1195 | int first = 0; | ||
1196 | |||
1197 | do { | ||
1198 | int n = conf->copies; | ||
1199 | int cnt = 0; | ||
1200 | while (n--) { | ||
1201 | if (conf->mirrors[first].rdev && | ||
1202 | first != ignore) | ||
1203 | cnt++; | ||
1204 | first = (first+1) % conf->raid_disks; | ||
1205 | } | ||
1206 | if (cnt == 0) | ||
1207 | return 0; | ||
1208 | } while (first != 0); | ||
1209 | return 1; | ||
1210 | } | ||
1211 | |||
952 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1212 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
953 | { | 1213 | { |
954 | char b[BDEVNAME_SIZE]; | 1214 | char b[BDEVNAME_SIZE]; |
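enough() walks the array in groups of conf->copies consecutive raid disks and requires at least one present device per group, optionally pretending one disk (the one about to fail or be removed) is already gone. The same check is extracted into plain C below so it can be run directly; the array layout is simplified to a presence flag per slot.

    #include <stdio.h>

    /* Is every block still represented on at least one disk if we
     * ignore disk 'ignore' (-1 to ignore nothing)?  present[i] says
     * whether raid disk i currently has an rdev. */
    static int enough(const int *present, int raid_disks, int copies, int ignore)
    {
        int first = 0;

        do {
            int n = copies, cnt = 0;
            while (n--) {
                if (present[first] && first != ignore)
                    cnt++;
                first = (first + 1) % raid_disks;
            }
            if (cnt == 0)
                return 0;
        } while (first != 0);
        return 1;
    }

    int main(void)
    {
        /* 4 disks, 2 copies (near=2 layout): disks {0,1} and {2,3}
         * each hold one block's copies. */
        int present[4] = { 1, 0, 1, 1 };
        printf("%d\n", enough(present, 4, 2, -1)); /* 1: disk 0 covers pair {0,1} */
        printf("%d\n", enough(present, 4, 2, 0));  /* 0: losing disk 0 too is fatal */
        return 0;
    }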
@@ -961,13 +1221,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
961 | * else mark the drive as failed | 1221 | * else mark the drive as failed |
962 | */ | 1222 | */ |
963 | if (test_bit(In_sync, &rdev->flags) | 1223 | if (test_bit(In_sync, &rdev->flags) |
964 | && conf->raid_disks-mddev->degraded == 1) | 1224 | && !enough(conf, rdev->raid_disk)) |
965 | /* | 1225 | /* |
966 | * Don't fail the drive, just return an IO error. | 1226 | * Don't fail the drive, just return an IO error. |
967 | * The test should really be more sophisticated than | ||
968 | * "working_disks == 1", but it isn't critical, and | ||
969 | * can wait until we do more sophisticated "is the drive | ||
970 | * really dead" tests... | ||
971 | */ | 1227 | */ |
972 | return; | 1228 | return; |
973 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1229 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
@@ -980,6 +1236,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
980 | */ | 1236 | */ |
981 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1237 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
982 | } | 1238 | } |
1239 | set_bit(Blocked, &rdev->flags); | ||
983 | set_bit(Faulty, &rdev->flags); | 1240 | set_bit(Faulty, &rdev->flags); |
984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1241 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
985 | printk(KERN_ALERT | 1242 | printk(KERN_ALERT |
@@ -1022,27 +1279,6 @@ static void close_sync(conf_t *conf) | |||
1022 | conf->r10buf_pool = NULL; | 1279 | conf->r10buf_pool = NULL; |
1023 | } | 1280 | } |
1024 | 1281 | ||
1025 | /* check if there are enough drives for | ||
1026 | * every block to appear on atleast one | ||
1027 | */ | ||
1028 | static int enough(conf_t *conf) | ||
1029 | { | ||
1030 | int first = 0; | ||
1031 | |||
1032 | do { | ||
1033 | int n = conf->copies; | ||
1034 | int cnt = 0; | ||
1035 | while (n--) { | ||
1036 | if (conf->mirrors[first].rdev) | ||
1037 | cnt++; | ||
1038 | first = (first+1) % conf->raid_disks; | ||
1039 | } | ||
1040 | if (cnt == 0) | ||
1041 | return 0; | ||
1042 | } while (first != 0); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | static int raid10_spare_active(mddev_t *mddev) | 1282 | static int raid10_spare_active(mddev_t *mddev) |
1047 | { | 1283 | { |
1048 | int i; | 1284 | int i; |
@@ -1078,7 +1314,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1078 | conf_t *conf = mddev->private; | 1314 | conf_t *conf = mddev->private; |
1079 | int err = -EEXIST; | 1315 | int err = -EEXIST; |
1080 | int mirror; | 1316 | int mirror; |
1081 | mirror_info_t *p; | ||
1082 | int first = 0; | 1317 | int first = 0; |
1083 | int last = conf->raid_disks - 1; | 1318 | int last = conf->raid_disks - 1; |
1084 | 1319 | ||
@@ -1087,44 +1322,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1087 | * very different from resync | 1322 | * very different from resync |
1088 | */ | 1323 | */ |
1089 | return -EBUSY; | 1324 | return -EBUSY; |
1090 | if (!enough(conf)) | 1325 | if (!enough(conf, -1)) |
1091 | return -EINVAL; | 1326 | return -EINVAL; |
1092 | 1327 | ||
1093 | if (rdev->raid_disk >= 0) | 1328 | if (rdev->raid_disk >= 0) |
1094 | first = last = rdev->raid_disk; | 1329 | first = last = rdev->raid_disk; |
1095 | 1330 | ||
1096 | if (rdev->saved_raid_disk >= 0 && | 1331 | if (rdev->saved_raid_disk >= first && |
1097 | rdev->saved_raid_disk >= first && | ||
1098 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1332 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1099 | mirror = rdev->saved_raid_disk; | 1333 | mirror = rdev->saved_raid_disk; |
1100 | else | 1334 | else |
1101 | mirror = first; | 1335 | mirror = first; |
1102 | for ( ; mirror <= last ; mirror++) | 1336 | for ( ; mirror <= last ; mirror++) { |
1103 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1337 | mirror_info_t *p = &conf->mirrors[mirror]; |
1104 | 1338 | if (p->recovery_disabled == mddev->recovery_disabled) | |
1105 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1339 | continue; |
1106 | rdev->data_offset << 9); | 1340 | if (p->rdev) |
1107 | /* as we don't honour merge_bvec_fn, we must | 1341 | continue; |
1108 | * never risk violating it, so limit | ||
1109 | * ->max_segments to one lying with a single | ||
1110 | * page, as a one page request is never in | ||
1111 | * violation. | ||
1112 | */ | ||
1113 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1114 | blk_queue_max_segments(mddev->queue, 1); | ||
1115 | blk_queue_segment_boundary(mddev->queue, | ||
1116 | PAGE_CACHE_SIZE - 1); | ||
1117 | } | ||
1118 | 1342 | ||
1119 | p->head_position = 0; | 1343 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1120 | rdev->raid_disk = mirror; | 1344 | rdev->data_offset << 9); |
1121 | err = 0; | 1345 | /* as we don't honour merge_bvec_fn, we must |
1122 | if (rdev->saved_raid_disk != mirror) | 1346 | * never risk violating it, so limit |
1123 | conf->fullsync = 1; | 1347 | * ->max_segments to one lying with a single |
1124 | rcu_assign_pointer(p->rdev, rdev); | 1348 | * page, as a one page request is never in |
1125 | break; | 1349 | * violation. |
1350 | */ | ||
1351 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1352 | blk_queue_max_segments(mddev->queue, 1); | ||
1353 | blk_queue_segment_boundary(mddev->queue, | ||
1354 | PAGE_CACHE_SIZE - 1); | ||
1126 | } | 1355 | } |
1127 | 1356 | ||
1357 | p->head_position = 0; | ||
1358 | rdev->raid_disk = mirror; | ||
1359 | err = 0; | ||
1360 | if (rdev->saved_raid_disk != mirror) | ||
1361 | conf->fullsync = 1; | ||
1362 | rcu_assign_pointer(p->rdev, rdev); | ||
1363 | break; | ||
1364 | } | ||
1365 | |||
1128 | md_integrity_add_rdev(rdev, mddev); | 1366 | md_integrity_add_rdev(rdev, mddev); |
1129 | print_conf(conf); | 1367 | print_conf(conf); |
1130 | return err; | 1368 | return err; |
@@ -1149,7 +1387,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1149 | * is not possible. | 1387 | * is not possible. |
1150 | */ | 1388 | */ |
1151 | if (!test_bit(Faulty, &rdev->flags) && | 1389 | if (!test_bit(Faulty, &rdev->flags) && |
1152 | enough(conf)) { | 1390 | mddev->recovery_disabled != p->recovery_disabled && |
1391 | enough(conf, -1)) { | ||
1153 | err = -EBUSY; | 1392 | err = -EBUSY; |
1154 | goto abort; | 1393 | goto abort; |
1155 | } | 1394 | } |
@@ -1174,24 +1413,18 @@ static void end_sync_read(struct bio *bio, int error) | |||
1174 | { | 1413 | { |
1175 | r10bio_t *r10_bio = bio->bi_private; | 1414 | r10bio_t *r10_bio = bio->bi_private; |
1176 | conf_t *conf = r10_bio->mddev->private; | 1415 | conf_t *conf = r10_bio->mddev->private; |
1177 | int i,d; | 1416 | int d; |
1178 | 1417 | ||
1179 | for (i=0; i<conf->copies; i++) | 1418 | d = find_bio_disk(conf, r10_bio, bio, NULL); |
1180 | if (r10_bio->devs[i].bio == bio) | ||
1181 | break; | ||
1182 | BUG_ON(i == conf->copies); | ||
1183 | update_head_pos(i, r10_bio); | ||
1184 | d = r10_bio->devs[i].devnum; | ||
1185 | 1419 | ||
1186 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1420 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1187 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 1421 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
1188 | else { | 1422 | else |
1423 | /* The write handler will notice the lack of | ||
1424 | * R10BIO_Uptodate and record any errors etc | ||
1425 | */ | ||
1189 | atomic_add(r10_bio->sectors, | 1426 | atomic_add(r10_bio->sectors, |
1190 | &conf->mirrors[d].rdev->corrected_errors); | 1427 | &conf->mirrors[d].rdev->corrected_errors); |
1191 | if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | ||
1192 | md_error(r10_bio->mddev, | ||
1193 | conf->mirrors[d].rdev); | ||
1194 | } | ||
1195 | 1428 | ||
1196 | /* for reconstruct, we always reschedule after a read. | 1429 | /* for reconstruct, we always reschedule after a read. |
1197 | * for resync, only after all reads | 1430 | * for resync, only after all reads |
@@ -1206,40 +1439,60 @@ static void end_sync_read(struct bio *bio, int error) | |||
1206 | } | 1439 | } |
1207 | } | 1440 | } |
1208 | 1441 | ||
1209 | static void end_sync_write(struct bio *bio, int error) | 1442 | static void end_sync_request(r10bio_t *r10_bio) |
1210 | { | 1443 | { |
1211 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1212 | r10bio_t *r10_bio = bio->bi_private; | ||
1213 | mddev_t *mddev = r10_bio->mddev; | 1444 | mddev_t *mddev = r10_bio->mddev; |
1214 | conf_t *conf = mddev->private; | ||
1215 | int i,d; | ||
1216 | |||
1217 | for (i = 0; i < conf->copies; i++) | ||
1218 | if (r10_bio->devs[i].bio == bio) | ||
1219 | break; | ||
1220 | d = r10_bio->devs[i].devnum; | ||
1221 | |||
1222 | if (!uptodate) | ||
1223 | md_error(mddev, conf->mirrors[d].rdev); | ||
1224 | |||
1225 | update_head_pos(i, r10_bio); | ||
1226 | 1445 | ||
1227 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1228 | while (atomic_dec_and_test(&r10_bio->remaining)) { | 1446 | while (atomic_dec_and_test(&r10_bio->remaining)) { |
1229 | if (r10_bio->master_bio == NULL) { | 1447 | if (r10_bio->master_bio == NULL) { |
1230 | /* the primary of several recovery bios */ | 1448 | /* the primary of several recovery bios */ |
1231 | sector_t s = r10_bio->sectors; | 1449 | sector_t s = r10_bio->sectors; |
1232 | put_buf(r10_bio); | 1450 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1451 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1452 | reschedule_retry(r10_bio); | ||
1453 | else | ||
1454 | put_buf(r10_bio); | ||
1233 | md_done_sync(mddev, s, 1); | 1455 | md_done_sync(mddev, s, 1); |
1234 | break; | 1456 | break; |
1235 | } else { | 1457 | } else { |
1236 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; | 1458 | r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; |
1237 | put_buf(r10_bio); | 1459 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
1460 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
1461 | reschedule_retry(r10_bio); | ||
1462 | else | ||
1463 | put_buf(r10_bio); | ||
1238 | r10_bio = r10_bio2; | 1464 | r10_bio = r10_bio2; |
1239 | } | 1465 | } |
1240 | } | 1466 | } |
1241 | } | 1467 | } |
1242 | 1468 | ||
1469 | static void end_sync_write(struct bio *bio, int error) | ||
1470 | { | ||
1471 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1472 | r10bio_t *r10_bio = bio->bi_private; | ||
1473 | mddev_t *mddev = r10_bio->mddev; | ||
1474 | conf_t *conf = mddev->private; | ||
1475 | int d; | ||
1476 | sector_t first_bad; | ||
1477 | int bad_sectors; | ||
1478 | int slot; | ||
1479 | |||
1480 | d = find_bio_disk(conf, r10_bio, bio, &slot); | ||
1481 | |||
1482 | if (!uptodate) { | ||
1483 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); | ||
1484 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
1485 | } else if (is_badblock(conf->mirrors[d].rdev, | ||
1486 | r10_bio->devs[slot].addr, | ||
1487 | r10_bio->sectors, | ||
1488 | &first_bad, &bad_sectors)) | ||
1489 | set_bit(R10BIO_MadeGood, &r10_bio->state); | ||
1490 | |||
1491 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | ||
1492 | |||
1493 | end_sync_request(r10_bio); | ||
1494 | } | ||
1495 | |||
1243 | /* | 1496 | /* |
1244 | * Note: sync and recover and handled very differently for raid10 | 1497 | * Note: sync and recover and handled very differently for raid10 |
1245 | * This code is for resync. | 1498 | * This code is for resync. |
@@ -1299,11 +1552,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1299 | if (j == vcnt) | 1552 | if (j == vcnt) |
1300 | continue; | 1553 | continue; |
1301 | mddev->resync_mismatches += r10_bio->sectors; | 1554 | mddev->resync_mismatches += r10_bio->sectors; |
1555 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | ||
1556 | /* Don't fix anything. */ | ||
1557 | continue; | ||
1302 | } | 1558 | } |
1303 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 1559 | /* Ok, we need to write this bio, either to correct an |
1304 | /* Don't fix anything. */ | 1560 | * inconsistency or to correct an unreadable block. |
1305 | continue; | ||
1306 | /* Ok, we need to write this bio | ||
1307 | * First we need to fixup bv_offset, bv_len and | 1561 | * First we need to fixup bv_offset, bv_len and |
1308 | * bi_vecs, as the read request might have corrupted these | 1562 | * bi_vecs, as the read request might have corrupted these |
1309 | */ | 1563 | */ |
@@ -1355,32 +1609,107 @@ done: | |||
1355 | * The second for writing. | 1609 | * The second for writing. |
1356 | * | 1610 | * |
1357 | */ | 1611 | */ |
1612 | static void fix_recovery_read_error(r10bio_t *r10_bio) | ||
1613 | { | ||
1614 | /* We got a read error during recovery. | ||
1615 | * We repeat the read in smaller page-sized sections. | ||
1616 | * If a read succeeds, write it to the new device or record | ||
1617 | * a bad block if we cannot. | ||
1618 | * If a read fails, record a bad block on both old and | ||
1619 | * new devices. | ||
1620 | */ | ||
1621 | mddev_t *mddev = r10_bio->mddev; | ||
1622 | conf_t *conf = mddev->private; | ||
1623 | struct bio *bio = r10_bio->devs[0].bio; | ||
1624 | sector_t sect = 0; | ||
1625 | int sectors = r10_bio->sectors; | ||
1626 | int idx = 0; | ||
1627 | int dr = r10_bio->devs[0].devnum; | ||
1628 | int dw = r10_bio->devs[1].devnum; | ||
1629 | |||
1630 | while (sectors) { | ||
1631 | int s = sectors; | ||
1632 | mdk_rdev_t *rdev; | ||
1633 | sector_t addr; | ||
1634 | int ok; | ||
1635 | |||
1636 | if (s > (PAGE_SIZE>>9)) | ||
1637 | s = PAGE_SIZE >> 9; | ||
1638 | |||
1639 | rdev = conf->mirrors[dr].rdev; | ||
1640 | addr = r10_bio->devs[0].addr + sect, | ||
1641 | ok = sync_page_io(rdev, | ||
1642 | addr, | ||
1643 | s << 9, | ||
1644 | bio->bi_io_vec[idx].bv_page, | ||
1645 | READ, false); | ||
1646 | if (ok) { | ||
1647 | rdev = conf->mirrors[dw].rdev; | ||
1648 | addr = r10_bio->devs[1].addr + sect; | ||
1649 | ok = sync_page_io(rdev, | ||
1650 | addr, | ||
1651 | s << 9, | ||
1652 | bio->bi_io_vec[idx].bv_page, | ||
1653 | WRITE, false); | ||
1654 | if (!ok) | ||
1655 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1656 | } | ||
1657 | if (!ok) { | ||
1658 | /* We don't worry if we cannot set a bad block - | ||
1659 | * it really is bad so there is no loss in not | ||
1660 | * recording it yet | ||
1661 | */ | ||
1662 | rdev_set_badblocks(rdev, addr, s, 0); | ||
1663 | |||
1664 | if (rdev != conf->mirrors[dw].rdev) { | ||
1665 | /* need bad block on destination too */ | ||
1666 | mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev; | ||
1667 | addr = r10_bio->devs[1].addr + sect; | ||
1668 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | ||
1669 | if (!ok) { | ||
1670 | /* just abort the recovery */ | ||
1671 | printk(KERN_NOTICE | ||
1672 | "md/raid10:%s: recovery aborted" | ||
1673 | " due to read error\n", | ||
1674 | mdname(mddev)); | ||
1675 | |||
1676 | conf->mirrors[dw].recovery_disabled | ||
1677 | = mddev->recovery_disabled; | ||
1678 | set_bit(MD_RECOVERY_INTR, | ||
1679 | &mddev->recovery); | ||
1680 | break; | ||
1681 | } | ||
1682 | } | ||
1683 | } | ||
1684 | |||
1685 | sectors -= s; | ||
1686 | sect += s; | ||
1687 | idx++; | ||
1688 | } | ||
1689 | } | ||
1358 | 1690 | ||
1359 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1691 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
1360 | { | 1692 | { |
1361 | conf_t *conf = mddev->private; | 1693 | conf_t *conf = mddev->private; |
1362 | int i, d; | 1694 | int d; |
1363 | struct bio *bio, *wbio; | 1695 | struct bio *wbio; |
1364 | 1696 | ||
1697 | if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { | ||
1698 | fix_recovery_read_error(r10_bio); | ||
1699 | end_sync_request(r10_bio); | ||
1700 | return; | ||
1701 | } | ||
1365 | 1702 | ||
1366 | /* move the pages across to the second bio | 1703 | /* |
1704 | * share the pages with the first bio | ||
1367 | * and submit the write request | 1705 | * and submit the write request |
1368 | */ | 1706 | */ |
1369 | bio = r10_bio->devs[0].bio; | ||
1370 | wbio = r10_bio->devs[1].bio; | 1707 | wbio = r10_bio->devs[1].bio; |
1371 | for (i=0; i < wbio->bi_vcnt; i++) { | ||
1372 | struct page *p = bio->bi_io_vec[i].bv_page; | ||
1373 | bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; | ||
1374 | wbio->bi_io_vec[i].bv_page = p; | ||
1375 | } | ||
1376 | d = r10_bio->devs[1].devnum; | 1708 | d = r10_bio->devs[1].devnum; |
1377 | 1709 | ||
1378 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 1710 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1379 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); | 1711 | md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); |
1380 | if (test_bit(R10BIO_Uptodate, &r10_bio->state)) | 1712 | generic_make_request(wbio); |
1381 | generic_make_request(wbio); | ||
1382 | else | ||
1383 | bio_endio(wbio, -EIO); | ||
1384 | } | 1713 | } |
1385 | 1714 | ||
1386 | 1715 | ||
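fix_recovery_read_error() re-does the failed recovery read in page-sized steps: each chunk is read from the source and written to the destination, and a failed chunk gets a bad-block record, aborting recovery only if the record cannot be made on the destination. The user-space outline below follows that loop under simplifying assumptions; the I/O and bad-block calls are stubs, and the source/destination addresses are collapsed into one offset.

    #include <stdio.h>

    #define PAGE_SECTORS 8   /* 4K page / 512-byte sectors */

    /* Stubs standing in for sync_page_io()/rdev_set_badblocks(). */
    static int read_chunk(int sect, int len)  { (void)len; return sect != 24; }
    static int write_chunk(int sect, int len) { (void)sect; (void)len; return 1; }
    static int record_badblock(const char *dev, int sect, int len)
    { printf("badblock on %s: %d+%d\n", dev, sect, len); return 1; }

    static void recover(int total_sectors)
    {
        int sect = 0;

        while (total_sectors) {
            int s = total_sectors < PAGE_SECTORS ? total_sectors : PAGE_SECTORS;

            if (read_chunk(sect, s)) {
                if (!write_chunk(sect, s))
                    record_badblock("destination", sect, s);
            } else {
                /* unreadable on the source: mark it bad there and on the
                 * destination too; abort if even that record fails */
                record_badblock("source", sect, s);
                if (!record_badblock("destination", sect, s)) {
                    printf("abort recovery\n");
                    return;
                }
            }
            total_sectors -= s;
            sect += s;
        }
    }

    int main(void)
    {
        recover(32);   /* the chunk starting at sector 24 fails and is recorded */
        return 0;
    }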
@@ -1421,6 +1750,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1421 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | 1750 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); |
1422 | } | 1751 | } |
1423 | 1752 | ||
1753 | static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
1754 | int sectors, struct page *page, int rw) | ||
1755 | { | ||
1756 | sector_t first_bad; | ||
1757 | int bad_sectors; | ||
1758 | |||
1759 | if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) | ||
1760 | && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) | ||
1761 | return -1; | ||
1762 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
1763 | /* success */ | ||
1764 | return 1; | ||
1765 | if (rw == WRITE) | ||
1766 | set_bit(WriteErrorSeen, &rdev->flags); | ||
1767 | /* need to record an error - either for the block or the device */ | ||
1768 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
1769 | md_error(rdev->mddev, rdev); | ||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1424 | /* | 1773 | /* |
1425 | * This is a kernel thread which: | 1774 | * This is a kernel thread which: |
1426 | * | 1775 | * |
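r10_sync_page_io() wraps the synchronous I/O with the bad-block bookkeeping and reports three outcomes: -1 "do not even try" (the range is already known bad), 1 success, 0 failure that has just been recorded. A tiny model of that contract is sketched below; stubs replace the real is_badblock/sync_page_io/rdev_set_badblocks calls.

    #include <stdio.h>

    enum rw { RW_READ, RW_WRITE };

    struct dev { int known_bad; int write_error_seen; int io_ok; };

    /* -1: refuse (known-bad region), 1: I/O succeeded, 0: failed and recorded. */
    static int sync_page_io_checked(struct dev *d, enum rw rw)
    {
        if (d->known_bad && (rw == RW_READ || d->write_error_seen))
            return -1;                      /* don't retry into a known hole */
        if (d->io_ok)
            return 1;
        if (rw == RW_WRITE)
            d->write_error_seen = 1;        /* remember this device misbehaves */
        d->known_bad = 1;                   /* models rdev_set_badblocks() */
        return 0;
    }

    int main(void)
    {
        struct dev d = { 0, 0, 0 };         /* healthy metadata, failing media */
        printf("%d\n", sync_page_io_checked(&d, RW_WRITE));  /* 0: record error */
        printf("%d\n", sync_page_io_checked(&d, RW_WRITE));  /* -1: now refused */
        return 0;
    }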
@@ -1476,10 +1825,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1476 | 1825 | ||
1477 | rcu_read_lock(); | 1826 | rcu_read_lock(); |
1478 | do { | 1827 | do { |
1828 | sector_t first_bad; | ||
1829 | int bad_sectors; | ||
1830 | |||
1479 | d = r10_bio->devs[sl].devnum; | 1831 | d = r10_bio->devs[sl].devnum; |
1480 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1832 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1481 | if (rdev && | 1833 | if (rdev && |
1482 | test_bit(In_sync, &rdev->flags)) { | 1834 | test_bit(In_sync, &rdev->flags) && |
1835 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, | ||
1836 | &first_bad, &bad_sectors) == 0) { | ||
1483 | atomic_inc(&rdev->nr_pending); | 1837 | atomic_inc(&rdev->nr_pending); |
1484 | rcu_read_unlock(); | 1838 | rcu_read_unlock(); |
1485 | success = sync_page_io(rdev, | 1839 | success = sync_page_io(rdev, |
@@ -1499,9 +1853,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1499 | rcu_read_unlock(); | 1853 | rcu_read_unlock(); |
1500 | 1854 | ||
1501 | if (!success) { | 1855 | if (!success) { |
1502 | /* Cannot read from anywhere -- bye bye array */ | 1856 | /* Cannot read from anywhere, just mark the block |
1857 | * as bad on the first device to discourage future | ||
1858 | * reads. | ||
1859 | */ | ||
1503 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; | 1860 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; |
1504 | md_error(mddev, conf->mirrors[dn].rdev); | 1861 | rdev = conf->mirrors[dn].rdev; |
1862 | |||
1863 | if (!rdev_set_badblocks( | ||
1864 | rdev, | ||
1865 | r10_bio->devs[r10_bio->read_slot].addr | ||
1866 | + sect, | ||
1867 | s, 0)) | ||
1868 | md_error(mddev, rdev); | ||
1505 | break; | 1869 | break; |
1506 | } | 1870 | } |
1507 | 1871 | ||
@@ -1516,80 +1880,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1516 | sl--; | 1880 | sl--; |
1517 | d = r10_bio->devs[sl].devnum; | 1881 | d = r10_bio->devs[sl].devnum; |
1518 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1882 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1519 | if (rdev && | 1883 | if (!rdev || |
1520 | test_bit(In_sync, &rdev->flags)) { | 1884 | !test_bit(In_sync, &rdev->flags)) |
1521 | atomic_inc(&rdev->nr_pending); | 1885 | continue; |
1522 | rcu_read_unlock(); | 1886 | |
1523 | atomic_add(s, &rdev->corrected_errors); | 1887 | atomic_inc(&rdev->nr_pending); |
1524 | if (sync_page_io(rdev, | 1888 | rcu_read_unlock(); |
1525 | r10_bio->devs[sl].addr + | 1889 | if (r10_sync_page_io(rdev, |
1526 | sect, | 1890 | r10_bio->devs[sl].addr + |
1527 | s<<9, conf->tmppage, WRITE, false) | 1891 | sect, |
1528 | == 0) { | 1892 | s<<9, conf->tmppage, WRITE) |
1529 | /* Well, this device is dead */ | 1893 | == 0) { |
1530 | printk(KERN_NOTICE | 1894 | /* Well, this device is dead */ |
1531 | "md/raid10:%s: read correction " | 1895 | printk(KERN_NOTICE |
1532 | "write failed" | 1896 | "md/raid10:%s: read correction " |
1533 | " (%d sectors at %llu on %s)\n", | 1897 | "write failed" |
1534 | mdname(mddev), s, | 1898 | " (%d sectors at %llu on %s)\n", |
1535 | (unsigned long long)( | 1899 | mdname(mddev), s, |
1536 | sect + rdev->data_offset), | 1900 | (unsigned long long)( |
1537 | bdevname(rdev->bdev, b)); | 1901 | sect + rdev->data_offset), |
1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1902 | bdevname(rdev->bdev, b)); |
1539 | "drive\n", | 1903 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
1540 | mdname(mddev), | 1904 | "drive\n", |
1541 | bdevname(rdev->bdev, b)); | 1905 | mdname(mddev), |
1542 | md_error(mddev, rdev); | 1906 | bdevname(rdev->bdev, b)); |
1543 | } | ||
1544 | rdev_dec_pending(rdev, mddev); | ||
1545 | rcu_read_lock(); | ||
1546 | } | 1907 | } |
1908 | rdev_dec_pending(rdev, mddev); | ||
1909 | rcu_read_lock(); | ||
1547 | } | 1910 | } |
1548 | sl = start; | 1911 | sl = start; |
1549 | while (sl != r10_bio->read_slot) { | 1912 | while (sl != r10_bio->read_slot) { |
1913 | char b[BDEVNAME_SIZE]; | ||
1550 | 1914 | ||
1551 | if (sl==0) | 1915 | if (sl==0) |
1552 | sl = conf->copies; | 1916 | sl = conf->copies; |
1553 | sl--; | 1917 | sl--; |
1554 | d = r10_bio->devs[sl].devnum; | 1918 | d = r10_bio->devs[sl].devnum; |
1555 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1919 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
1556 | if (rdev && | 1920 | if (!rdev || |
1557 | test_bit(In_sync, &rdev->flags)) { | 1921 | !test_bit(In_sync, &rdev->flags)) |
1558 | char b[BDEVNAME_SIZE]; | 1922 | continue; |
1559 | atomic_inc(&rdev->nr_pending); | ||
1560 | rcu_read_unlock(); | ||
1561 | if (sync_page_io(rdev, | ||
1562 | r10_bio->devs[sl].addr + | ||
1563 | sect, | ||
1564 | s<<9, conf->tmppage, | ||
1565 | READ, false) == 0) { | ||
1566 | /* Well, this device is dead */ | ||
1567 | printk(KERN_NOTICE | ||
1568 | "md/raid10:%s: unable to read back " | ||
1569 | "corrected sectors" | ||
1570 | " (%d sectors at %llu on %s)\n", | ||
1571 | mdname(mddev), s, | ||
1572 | (unsigned long long)( | ||
1573 | sect + rdev->data_offset), | ||
1574 | bdevname(rdev->bdev, b)); | ||
1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | ||
1576 | mdname(mddev), | ||
1577 | bdevname(rdev->bdev, b)); | ||
1578 | |||
1579 | md_error(mddev, rdev); | ||
1580 | } else { | ||
1581 | printk(KERN_INFO | ||
1582 | "md/raid10:%s: read error corrected" | ||
1583 | " (%d sectors at %llu on %s)\n", | ||
1584 | mdname(mddev), s, | ||
1585 | (unsigned long long)( | ||
1586 | sect + rdev->data_offset), | ||
1587 | bdevname(rdev->bdev, b)); | ||
1588 | } | ||
1589 | 1923 | ||
1590 | rdev_dec_pending(rdev, mddev); | 1924 | atomic_inc(&rdev->nr_pending); |
1591 | rcu_read_lock(); | 1925 | rcu_read_unlock(); |
1926 | switch (r10_sync_page_io(rdev, | ||
1927 | r10_bio->devs[sl].addr + | ||
1928 | sect, | ||
1929 | s<<9, conf->tmppage, | ||
1930 | READ)) { | ||
1931 | case 0: | ||
1932 | /* Well, this device is dead */ | ||
1933 | printk(KERN_NOTICE | ||
1934 | "md/raid10:%s: unable to read back " | ||
1935 | "corrected sectors" | ||
1936 | " (%d sectors at %llu on %s)\n", | ||
1937 | mdname(mddev), s, | ||
1938 | (unsigned long long)( | ||
1939 | sect + rdev->data_offset), | ||
1940 | bdevname(rdev->bdev, b)); | ||
1941 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
1942 | "drive\n", | ||
1943 | mdname(mddev), | ||
1944 | bdevname(rdev->bdev, b)); | ||
1945 | break; | ||
1946 | case 1: | ||
1947 | printk(KERN_INFO | ||
1948 | "md/raid10:%s: read error corrected" | ||
1949 | " (%d sectors at %llu on %s)\n", | ||
1950 | mdname(mddev), s, | ||
1951 | (unsigned long long)( | ||
1952 | sect + rdev->data_offset), | ||
1953 | bdevname(rdev->bdev, b)); | ||
1954 | atomic_add(s, &rdev->corrected_errors); | ||
1592 | } | 1955 | } |
1956 | |||
1957 | rdev_dec_pending(rdev, mddev); | ||
1958 | rcu_read_lock(); | ||
1593 | } | 1959 | } |
1594 | rcu_read_unlock(); | 1960 | rcu_read_unlock(); |
1595 | 1961 | ||
@@ -1598,21 +1964,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1598 | } | 1964 | } |
1599 | } | 1965 | } |
1600 | 1966 | ||
1967 | static void bi_complete(struct bio *bio, int error) | ||
1968 | { | ||
1969 | complete((struct completion *)bio->bi_private); | ||
1970 | } | ||
1971 | |||
1972 | static int submit_bio_wait(int rw, struct bio *bio) | ||
1973 | { | ||
1974 | struct completion event; | ||
1975 | rw |= REQ_SYNC; | ||
1976 | |||
1977 | init_completion(&event); | ||
1978 | bio->bi_private = &event; | ||
1979 | bio->bi_end_io = bi_complete; | ||
1980 | submit_bio(rw, bio); | ||
1981 | wait_for_completion(&event); | ||
1982 | |||
1983 | return test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1984 | } | ||
1985 | |||
1986 | static int narrow_write_error(r10bio_t *r10_bio, int i) | ||
1987 | { | ||
1988 | struct bio *bio = r10_bio->master_bio; | ||
1989 | mddev_t *mddev = r10_bio->mddev; | ||
1990 | conf_t *conf = mddev->private; | ||
1991 | mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; | ||
1992 | /* bio has the data to be written to slot 'i' where | ||
1993 | * we just recently had a write error. | ||
1994 | * We repeatedly clone the bio and trim down to one block, | ||
1995 | * then try the write. Where the write fails we record | ||
1996 | * a bad block. | ||
1997 | * It is conceivable that the bio doesn't exactly align with | ||
1998 | * blocks. We must handle this. | ||
1999 | * | ||
2000 | * We currently own a reference to the rdev. | ||
2001 | */ | ||
2002 | |||
2003 | int block_sectors; | ||
2004 | sector_t sector; | ||
2005 | int sectors; | ||
2006 | int sect_to_write = r10_bio->sectors; | ||
2007 | int ok = 1; | ||
2008 | |||
2009 | if (rdev->badblocks.shift < 0) | ||
2010 | return 0; | ||
2011 | |||
2012 | block_sectors = 1 << rdev->badblocks.shift; | ||
2013 | sector = r10_bio->sector; | ||
2014 | sectors = ((r10_bio->sector + block_sectors) | ||
2015 | & ~(sector_t)(block_sectors - 1)) | ||
2016 | - sector; | ||
2017 | |||
2018 | while (sect_to_write) { | ||
2019 | struct bio *wbio; | ||
2020 | if (sectors > sect_to_write) | ||
2021 | sectors = sect_to_write; | ||
2022 | /* Write at 'sector' for 'sectors' */ | ||
2023 | wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); | ||
2024 | md_trim_bio(wbio, sector - bio->bi_sector, sectors); | ||
2025 | wbio->bi_sector = (r10_bio->devs[i].addr+ | ||
2026 | rdev->data_offset+ | ||
2027 | (sector - r10_bio->sector)); | ||
2028 | wbio->bi_bdev = rdev->bdev; | ||
2029 | if (submit_bio_wait(WRITE, wbio) == 0) | ||
2030 | /* Failure! */ | ||
2031 | ok = rdev_set_badblocks(rdev, sector, | ||
2032 | sectors, 0) | ||
2033 | && ok; | ||
2034 | |||
2035 | bio_put(wbio); | ||
2036 | sect_to_write -= sectors; | ||
2037 | sector += sectors; | ||
2038 | sectors = block_sectors; | ||
2039 | } | ||
2040 | return ok; | ||
2041 | } | ||
2042 | |||
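/*
 * Editorial worked example (not part of this diff) of the first-chunk
 * arithmetic in narrow_write_error() above, assuming badblocks.shift == 3,
 * i.e. 8-sector bad-block granularity:
 *
 *	sector  = r10_bio->sector                    = 1003
 *	sectors = ((1003 + 8) & ~(sector_t)7) - 1003 = 1008 - 1003 = 5
 *
 * so the first cloned write covers only the 5 sectors up to the next
 * 8-sector boundary, and every later iteration writes a full block_sectors
 * chunk; any chunk that fails can then be recorded as a single bad-block
 * entry at exactly that granularity.
 */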
2043 | static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio) | ||
2044 | { | ||
2045 | int slot = r10_bio->read_slot; | ||
2046 | int mirror = r10_bio->devs[slot].devnum; | ||
2047 | struct bio *bio; | ||
2048 | conf_t *conf = mddev->private; | ||
2049 | mdk_rdev_t *rdev; | ||
2050 | char b[BDEVNAME_SIZE]; | ||
2051 | unsigned long do_sync; | ||
2052 | int max_sectors; | ||
2053 | |||
2054 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
2055 | * the block is bad and we can fix it. | ||
2056 | * We freeze all other IO, and try reading the block from | ||
2057 | * other devices. When we find one, we re-write | ||
2058 | * and check whether that fixes the read error. | ||
2059 | * This is all done synchronously while the array is | ||
2060 | * frozen. | ||
2061 | */ | ||
2062 | if (mddev->ro == 0) { | ||
2063 | freeze_array(conf); | ||
2064 | fix_read_error(conf, mddev, r10_bio); | ||
2065 | unfreeze_array(conf); | ||
2066 | } | ||
2067 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
2068 | |||
2069 | bio = r10_bio->devs[slot].bio; | ||
2070 | bdevname(bio->bi_bdev, b); | ||
2071 | r10_bio->devs[slot].bio = | ||
2072 | mddev->ro ? IO_BLOCKED : NULL; | ||
2073 | read_more: | ||
2074 | mirror = read_balance(conf, r10_bio, &max_sectors); | ||
2075 | if (mirror == -1) { | ||
2076 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
2077 | " read error for block %llu\n", | ||
2078 | mdname(mddev), b, | ||
2079 | (unsigned long long)r10_bio->sector); | ||
2080 | raid_end_bio_io(r10_bio); | ||
2081 | bio_put(bio); | ||
2082 | return; | ||
2083 | } | ||
2084 | |||
2085 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
2086 | if (bio) | ||
2087 | bio_put(bio); | ||
2088 | slot = r10_bio->read_slot; | ||
2089 | rdev = conf->mirrors[mirror].rdev; | ||
2090 | printk_ratelimited( | ||
2091 | KERN_ERR | ||
2092 | "md/raid10:%s: %s: redirecting" | ||
2093 | "sector %llu to another mirror\n", | ||
2094 | mdname(mddev), | ||
2095 | bdevname(rdev->bdev, b), | ||
2096 | (unsigned long long)r10_bio->sector); | ||
2097 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
2098 | GFP_NOIO, mddev); | ||
2099 | md_trim_bio(bio, | ||
2100 | r10_bio->sector - bio->bi_sector, | ||
2101 | max_sectors); | ||
2102 | r10_bio->devs[slot].bio = bio; | ||
2103 | bio->bi_sector = r10_bio->devs[slot].addr | ||
2104 | + rdev->data_offset; | ||
2105 | bio->bi_bdev = rdev->bdev; | ||
2106 | bio->bi_rw = READ | do_sync; | ||
2107 | bio->bi_private = r10_bio; | ||
2108 | bio->bi_end_io = raid10_end_read_request; | ||
2109 | if (max_sectors < r10_bio->sectors) { | ||
2110 | /* Drat - have to split this up more */ | ||
2111 | struct bio *mbio = r10_bio->master_bio; | ||
2112 | int sectors_handled = | ||
2113 | r10_bio->sector + max_sectors | ||
2114 | - mbio->bi_sector; | ||
2115 | r10_bio->sectors = max_sectors; | ||
2116 | spin_lock_irq(&conf->device_lock); | ||
2117 | if (mbio->bi_phys_segments == 0) | ||
2118 | mbio->bi_phys_segments = 2; | ||
2119 | else | ||
2120 | mbio->bi_phys_segments++; | ||
2121 | spin_unlock_irq(&conf->device_lock); | ||
2122 | generic_make_request(bio); | ||
2123 | bio = NULL; | ||
2124 | |||
2125 | r10_bio = mempool_alloc(conf->r10bio_pool, | ||
2126 | GFP_NOIO); | ||
2127 | r10_bio->master_bio = mbio; | ||
2128 | r10_bio->sectors = (mbio->bi_size >> 9) | ||
2129 | - sectors_handled; | ||
2130 | r10_bio->state = 0; | ||
2131 | set_bit(R10BIO_ReadError, | ||
2132 | &r10_bio->state); | ||
2133 | r10_bio->mddev = mddev; | ||
2134 | r10_bio->sector = mbio->bi_sector | ||
2135 | + sectors_handled; | ||
2136 | |||
2137 | goto read_more; | ||
2138 | } else | ||
2139 | generic_make_request(bio); | ||
2140 | } | ||
2141 | |||
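/*
 * Editorial worked example (not part of this diff) of the split path in
 * handle_read_error() above: with r10_bio->sector == mbio->bi_sector ==
 * 2000, r10_bio->sectors == 64 and read_balance() returning max_sectors ==
 * 16, sectors_handled = 2000 + 16 - 2000 = 16.  The first clone is trimmed
 * to 16 sectors, mbio->bi_phys_segments is bumped so the master bio only
 * completes once every piece has finished, and a fresh r10bio covering the
 * remaining 64 - 16 = 48 sectors starting at sector 2016 loops back to
 * read_more.
 */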
2142 | static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio) | ||
2143 | { | ||
2144 | /* Some sort of write request has finished and it | ||
2145 | * succeeded in writing where we thought there was a | ||
2146 | * bad block. So forget the bad block. | ||
2147 | * Or possibly it failed and we need to record | ||
2148 | * a bad block. | ||
2149 | */ | ||
2150 | int m; | ||
2151 | mdk_rdev_t *rdev; | ||
2152 | |||
2153 | if (test_bit(R10BIO_IsSync, &r10_bio->state) || | ||
2154 | test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
2155 | for (m = 0; m < conf->copies; m++) { | ||
2156 | int dev = r10_bio->devs[m].devnum; | ||
2157 | rdev = conf->mirrors[dev].rdev; | ||
2158 | if (r10_bio->devs[m].bio == NULL) | ||
2159 | continue; | ||
2160 | if (test_bit(BIO_UPTODATE, | ||
2161 | &r10_bio->devs[m].bio->bi_flags)) { | ||
2162 | rdev_clear_badblocks( | ||
2163 | rdev, | ||
2164 | r10_bio->devs[m].addr, | ||
2165 | r10_bio->sectors); | ||
2166 | } else { | ||
2167 | if (!rdev_set_badblocks( | ||
2168 | rdev, | ||
2169 | r10_bio->devs[m].addr, | ||
2170 | r10_bio->sectors, 0)) | ||
2171 | md_error(conf->mddev, rdev); | ||
2172 | } | ||
2173 | } | ||
2174 | put_buf(r10_bio); | ||
2175 | } else { | ||
2176 | for (m = 0; m < conf->copies; m++) { | ||
2177 | int dev = r10_bio->devs[m].devnum; | ||
2178 | struct bio *bio = r10_bio->devs[m].bio; | ||
2179 | rdev = conf->mirrors[dev].rdev; | ||
2180 | if (bio == IO_MADE_GOOD) { | ||
2181 | rdev_clear_badblocks( | ||
2182 | rdev, | ||
2183 | r10_bio->devs[m].addr, | ||
2184 | r10_bio->sectors); | ||
2185 | rdev_dec_pending(rdev, conf->mddev); | ||
2186 | } else if (bio != NULL && | ||
2187 | !test_bit(BIO_UPTODATE, &bio->bi_flags)) { | ||
2188 | if (!narrow_write_error(r10_bio, m)) { | ||
2189 | md_error(conf->mddev, rdev); | ||
2190 | set_bit(R10BIO_Degraded, | ||
2191 | &r10_bio->state); | ||
2192 | } | ||
2193 | rdev_dec_pending(rdev, conf->mddev); | ||
2194 | } | ||
2195 | } | ||
2196 | if (test_bit(R10BIO_WriteError, | ||
2197 | &r10_bio->state)) | ||
2198 | close_write(r10_bio); | ||
2199 | raid_end_bio_io(r10_bio); | ||
2200 | } | ||
2201 | } | ||
2202 | |||
1601 | static void raid10d(mddev_t *mddev) | 2203 | static void raid10d(mddev_t *mddev) |
1602 | { | 2204 | { |
1603 | r10bio_t *r10_bio; | 2205 | r10bio_t *r10_bio; |
1604 | struct bio *bio; | ||
1605 | unsigned long flags; | 2206 | unsigned long flags; |
1606 | conf_t *conf = mddev->private; | 2207 | conf_t *conf = mddev->private; |
1607 | struct list_head *head = &conf->retry_list; | 2208 | struct list_head *head = &conf->retry_list; |
1608 | mdk_rdev_t *rdev; | ||
1609 | struct blk_plug plug; | 2209 | struct blk_plug plug; |
1610 | 2210 | ||
1611 | md_check_recovery(mddev); | 2211 | md_check_recovery(mddev); |
1612 | 2212 | ||
1613 | blk_start_plug(&plug); | 2213 | blk_start_plug(&plug); |
1614 | for (;;) { | 2214 | for (;;) { |
1615 | char b[BDEVNAME_SIZE]; | ||
1616 | 2215 | ||
1617 | flush_pending_writes(conf); | 2216 | flush_pending_writes(conf); |
1618 | 2217 | ||
@@ -1628,64 +2227,26 @@ static void raid10d(mddev_t *mddev) | |||
1628 | 2227 | ||
1629 | mddev = r10_bio->mddev; | 2228 | mddev = r10_bio->mddev; |
1630 | conf = mddev->private; | 2229 | conf = mddev->private; |
1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) | 2230 | if (test_bit(R10BIO_MadeGood, &r10_bio->state) || |
2231 | test_bit(R10BIO_WriteError, &r10_bio->state)) | ||
2232 | handle_write_completed(conf, r10_bio); | ||
2233 | else if (test_bit(R10BIO_IsSync, &r10_bio->state)) | ||
1632 | sync_request_write(mddev, r10_bio); | 2234 | sync_request_write(mddev, r10_bio); |
1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 2235 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1634 | recovery_request_write(mddev, r10_bio); | 2236 | recovery_request_write(mddev, r10_bio); |
2237 | else if (test_bit(R10BIO_ReadError, &r10_bio->state)) | ||
2238 | handle_read_error(mddev, r10_bio); | ||
1635 | else { | 2239 | else { |
1636 | int slot = r10_bio->read_slot; | 2240 | /* just a partial read to be scheduled from a |
1637 | int mirror = r10_bio->devs[slot].devnum; | 2241 | * separate context |
1638 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1639 | * the block and we can fix it. | ||
1640 | * We freeze all other IO, and try reading the block from | ||
1641 | * other devices. When we find one, we re-write | ||
1642 | * and check it that fixes the read error. | ||
1643 | * This is all done synchronously while the array is | ||
1644 | * frozen. | ||
1645 | */ | 2242 | */ |
1646 | if (mddev->ro == 0) { | 2243 | int slot = r10_bio->read_slot; |
1647 | freeze_array(conf); | 2244 | generic_make_request(r10_bio->devs[slot].bio); |
1648 | fix_read_error(conf, mddev, r10_bio); | ||
1649 | unfreeze_array(conf); | ||
1650 | } | ||
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1652 | |||
1653 | bio = r10_bio->devs[slot].bio; | ||
1654 | r10_bio->devs[slot].bio = | ||
1655 | mddev->ro ? IO_BLOCKED : NULL; | ||
1656 | mirror = read_balance(conf, r10_bio); | ||
1657 | if (mirror == -1) { | ||
1658 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | ||
1659 | " read error for block %llu\n", | ||
1660 | mdname(mddev), | ||
1661 | bdevname(bio->bi_bdev,b), | ||
1662 | (unsigned long long)r10_bio->sector); | ||
1663 | raid_end_bio_io(r10_bio); | ||
1664 | bio_put(bio); | ||
1665 | } else { | ||
1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | ||
1667 | bio_put(bio); | ||
1668 | slot = r10_bio->read_slot; | ||
1669 | rdev = conf->mirrors[mirror].rdev; | ||
1670 | if (printk_ratelimit()) | ||
1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | ||
1672 | " another mirror\n", | ||
1673 | mdname(mddev), | ||
1674 | bdevname(rdev->bdev,b), | ||
1675 | (unsigned long long)r10_bio->sector); | ||
1676 | bio = bio_clone_mddev(r10_bio->master_bio, | ||
1677 | GFP_NOIO, mddev); | ||
1678 | r10_bio->devs[slot].bio = bio; | ||
1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
1680 | + rdev->data_offset; | ||
1681 | bio->bi_bdev = rdev->bdev; | ||
1682 | bio->bi_rw = READ | do_sync; | ||
1683 | bio->bi_private = r10_bio; | ||
1684 | bio->bi_end_io = raid10_end_read_request; | ||
1685 | generic_make_request(bio); | ||
1686 | } | ||
1687 | } | 2245 | } |
2246 | |||
1688 | cond_resched(); | 2247 | cond_resched(); |
2248 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
2249 | md_check_recovery(mddev); | ||
1689 | } | 2250 | } |
1690 | blk_finish_plug(&plug); | 2251 | blk_finish_plug(&plug); |
1691 | } | 2252 | } |
@@ -1746,7 +2307,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1746 | int i; | 2307 | int i; |
1747 | int max_sync; | 2308 | int max_sync; |
1748 | sector_t sync_blocks; | 2309 | sector_t sync_blocks; |
1749 | |||
1750 | sector_t sectors_skipped = 0; | 2310 | sector_t sectors_skipped = 0; |
1751 | int chunks_skipped = 0; | 2311 | int chunks_skipped = 0; |
1752 | 2312 | ||
@@ -1828,7 +2388,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1828 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); | 2388 | max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); |
1829 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 2389 | if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
1830 | /* recovery... the complicated one */ | 2390 | /* recovery... the complicated one */ |
1831 | int j, k; | 2391 | int j; |
1832 | r10_bio = NULL; | 2392 | r10_bio = NULL; |
1833 | 2393 | ||
1834 | for (i=0 ; i<conf->raid_disks; i++) { | 2394 | for (i=0 ; i<conf->raid_disks; i++) { |
@@ -1836,6 +2396,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1836 | r10bio_t *rb2; | 2396 | r10bio_t *rb2; |
1837 | sector_t sect; | 2397 | sector_t sect; |
1838 | int must_sync; | 2398 | int must_sync; |
2399 | int any_working; | ||
1839 | 2400 | ||
1840 | if (conf->mirrors[i].rdev == NULL || | 2401 | if (conf->mirrors[i].rdev == NULL || |
1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) | 2402 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
@@ -1887,19 +2448,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | 2448 | must_sync = bitmap_start_sync(mddev->bitmap, sect, |
1888 | &sync_blocks, still_degraded); | 2449 | &sync_blocks, still_degraded); |
1889 | 2450 | ||
2451 | any_working = 0; | ||
1890 | for (j=0; j<conf->copies;j++) { | 2452 | for (j=0; j<conf->copies;j++) { |
2453 | int k; | ||
1891 | int d = r10_bio->devs[j].devnum; | 2454 | int d = r10_bio->devs[j].devnum; |
2455 | sector_t from_addr, to_addr; | ||
2456 | mdk_rdev_t *rdev; | ||
2457 | sector_t sector, first_bad; | ||
2458 | int bad_sectors; | ||
1892 | if (!conf->mirrors[d].rdev || | 2459 | if (!conf->mirrors[d].rdev || |
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | 2460 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) |
1894 | continue; | 2461 | continue; |
1895 | /* This is where we read from */ | 2462 | /* This is where we read from */ |
2463 | any_working = 1; | ||
2464 | rdev = conf->mirrors[d].rdev; | ||
2465 | sector = r10_bio->devs[j].addr; | ||
2466 | |||
2467 | if (is_badblock(rdev, sector, max_sync, | ||
2468 | &first_bad, &bad_sectors)) { | ||
2469 | if (first_bad > sector) | ||
2470 | max_sync = first_bad - sector; | ||
2471 | else { | ||
2472 | bad_sectors -= (sector | ||
2473 | - first_bad); | ||
2474 | if (max_sync > bad_sectors) | ||
2475 | max_sync = bad_sectors; | ||
2476 | continue; | ||
2477 | } | ||
2478 | } | ||
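/*
 * Editorial worked example (not part of this diff) of the clipping above:
 * with sector == 1000 and max_sync == 128, a bad range reported as
 * first_bad == 1040, bad_sectors == 16 lies ahead, so max_sync becomes
 * 1040 - 1000 = 40 and this pass stops just short of it.  If the bad range
 * starts behind us instead (first_bad == 992, bad_sectors == 16), the 8 bad
 * sectors still covering 1000..1007 make this copy unusable, so max_sync is
 * clipped to 8 and 'continue' tries the next copy; the area beyond the bad
 * range is picked up on a later pass.
 */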
1896 | bio = r10_bio->devs[0].bio; | 2479 | bio = r10_bio->devs[0].bio; |
1897 | bio->bi_next = biolist; | 2480 | bio->bi_next = biolist; |
1898 | biolist = bio; | 2481 | biolist = bio; |
1899 | bio->bi_private = r10_bio; | 2482 | bio->bi_private = r10_bio; |
1900 | bio->bi_end_io = end_sync_read; | 2483 | bio->bi_end_io = end_sync_read; |
1901 | bio->bi_rw = READ; | 2484 | bio->bi_rw = READ; |
1902 | bio->bi_sector = r10_bio->devs[j].addr + | 2485 | from_addr = r10_bio->devs[j].addr; |
2486 | bio->bi_sector = from_addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | 2487 | conf->mirrors[d].rdev->data_offset; |
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2488 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2489 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
@@ -1916,26 +2500,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1916 | bio->bi_private = r10_bio; | 2500 | bio->bi_private = r10_bio; |
1917 | bio->bi_end_io = end_sync_write; | 2501 | bio->bi_end_io = end_sync_write; |
1918 | bio->bi_rw = WRITE; | 2502 | bio->bi_rw = WRITE; |
1919 | bio->bi_sector = r10_bio->devs[k].addr + | 2503 | to_addr = r10_bio->devs[k].addr; |
2504 | bio->bi_sector = to_addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | 2505 | conf->mirrors[i].rdev->data_offset; |
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | 2506 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1922 | 2507 | ||
1923 | r10_bio->devs[0].devnum = d; | 2508 | r10_bio->devs[0].devnum = d; |
2509 | r10_bio->devs[0].addr = from_addr; | ||
1924 | r10_bio->devs[1].devnum = i; | 2510 | r10_bio->devs[1].devnum = i; |
2511 | r10_bio->devs[1].addr = to_addr; | ||
1925 | 2512 | ||
1926 | break; | 2513 | break; |
1927 | } | 2514 | } |
1928 | if (j == conf->copies) { | 2515 | if (j == conf->copies) { |
1929 | /* Cannot recover, so abort the recovery */ | 2516 | /* Cannot recover, so abort the recovery or |
2517 | * record a bad block */ | ||
1930 | put_buf(r10_bio); | 2518 | put_buf(r10_bio); |
1931 | if (rb2) | 2519 | if (rb2) |
1932 | atomic_dec(&rb2->remaining); | 2520 | atomic_dec(&rb2->remaining); |
1933 | r10_bio = rb2; | 2521 | r10_bio = rb2; |
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 2522 | if (any_working) { |
1935 | &mddev->recovery)) | 2523 | /* problem is that there are bad blocks |
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | 2524 | * on other device(s) |
1937 | "working devices for recovery.\n", | 2525 | */ |
1938 | mdname(mddev)); | 2526 | int k; |
2527 | for (k = 0; k < conf->copies; k++) | ||
2528 | if (r10_bio->devs[k].devnum == i) | ||
2529 | break; | ||
2530 | if (!rdev_set_badblocks( | ||
2531 | conf->mirrors[i].rdev, | ||
2532 | r10_bio->devs[k].addr, | ||
2533 | max_sync, 0)) | ||
2534 | any_working = 0; | ||
2535 | } | ||
2536 | if (!any_working) { | ||
2537 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
2538 | &mddev->recovery)) | ||
2539 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
2540 | "working devices for recovery.\n", | ||
2541 | mdname(mddev)); | ||
2542 | conf->mirrors[i].recovery_disabled | ||
2543 | = mddev->recovery_disabled; | ||
2544 | } | ||
1939 | break; | 2545 | break; |
1940 | } | 2546 | } |
1941 | } | 2547 | } |
@@ -1979,12 +2585,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1979 | 2585 | ||
1980 | for (i=0; i<conf->copies; i++) { | 2586 | for (i=0; i<conf->copies; i++) { |
1981 | int d = r10_bio->devs[i].devnum; | 2587 | int d = r10_bio->devs[i].devnum; |
2588 | sector_t first_bad, sector; | ||
2589 | int bad_sectors; | ||
2590 | |||
1982 | bio = r10_bio->devs[i].bio; | 2591 | bio = r10_bio->devs[i].bio; |
1983 | bio->bi_end_io = NULL; | 2592 | bio->bi_end_io = NULL; |
1984 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 2593 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
1985 | if (conf->mirrors[d].rdev == NULL || | 2594 | if (conf->mirrors[d].rdev == NULL || |
1986 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) | 2595 | test_bit(Faulty, &conf->mirrors[d].rdev->flags)) |
1987 | continue; | 2596 | continue; |
2597 | sector = r10_bio->devs[i].addr; | ||
2598 | if (is_badblock(conf->mirrors[d].rdev, | ||
2599 | sector, max_sync, | ||
2600 | &first_bad, &bad_sectors)) { | ||
2601 | if (first_bad > sector) | ||
2602 | max_sync = first_bad - sector; | ||
2603 | else { | ||
2604 | bad_sectors -= (sector - first_bad); | ||
2605 | if (max_sync > bad_sectors) | ||
2606 | max_sync = bad_sectors; | ||
2607 | continue; | ||
2608 | } | ||
2609 | } | ||
1988 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2610 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
1989 | atomic_inc(&r10_bio->remaining); | 2611 | atomic_inc(&r10_bio->remaining); |
1990 | bio->bi_next = biolist; | 2612 | bio->bi_next = biolist; |
@@ -1992,7 +2614,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
1992 | bio->bi_private = r10_bio; | 2614 | bio->bi_private = r10_bio; |
1993 | bio->bi_end_io = end_sync_read; | 2615 | bio->bi_end_io = end_sync_read; |
1994 | bio->bi_rw = READ; | 2616 | bio->bi_rw = READ; |
1995 | bio->bi_sector = r10_bio->devs[i].addr + | 2617 | bio->bi_sector = sector + |
1996 | conf->mirrors[d].rdev->data_offset; | 2618 | conf->mirrors[d].rdev->data_offset; |
1997 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2619 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1998 | count++; | 2620 | count++; |
@@ -2079,7 +2701,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, | |||
2079 | return sectors_skipped + nr_sectors; | 2701 | return sectors_skipped + nr_sectors; |
2080 | giveup: | 2702 | giveup: |
2081 | /* There is nowhere to write, so all non-sync | 2703 | /* There is nowhere to write, so all non-sync |
2082 | * drives must be failed, so try the next chunk... | 2704 | * drives must be failed or in resync, all drives |
2705 | * have a bad block, so try the next chunk... | ||
2083 | */ | 2706 | */ |
2084 | if (sector_nr + max_sync < max_sector) | 2707 | if (sector_nr + max_sync < max_sector) |
2085 | max_sector = sector_nr + max_sync; | 2708 | max_sector = sector_nr + max_sync; |
@@ -2249,6 +2872,7 @@ static int run(mddev_t *mddev) | |||
2249 | (conf->raid_disks / conf->near_copies)); | 2872 | (conf->raid_disks / conf->near_copies)); |
2250 | 2873 | ||
2251 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2874 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2875 | |||
2252 | disk_idx = rdev->raid_disk; | 2876 | disk_idx = rdev->raid_disk; |
2253 | if (disk_idx >= conf->raid_disks | 2877 | if (disk_idx >= conf->raid_disks |
2254 | || disk_idx < 0) | 2878 | || disk_idx < 0) |
@@ -2271,7 +2895,7 @@ static int run(mddev_t *mddev) | |||
2271 | disk->head_position = 0; | 2895 | disk->head_position = 0; |
2272 | } | 2896 | } |
2273 | /* need to check that every block has at least one working mirror */ | 2897 | /* need to check that every block has at least one working mirror */ |
2274 | if (!enough(conf)) { | 2898 | if (!enough(conf, -1)) { |
2275 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 2899 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
2276 | mdname(mddev)); | 2900 | mdname(mddev)); |
2277 | goto out_free_conf; | 2901 | goto out_free_conf; |
@@ -2331,7 +2955,7 @@ static int run(mddev_t *mddev) | |||
2331 | return 0; | 2955 | return 0; |
2332 | 2956 | ||
2333 | out_free_conf: | 2957 | out_free_conf: |
2334 | md_unregister_thread(mddev->thread); | 2958 | md_unregister_thread(&mddev->thread); |
2335 | if (conf->r10bio_pool) | 2959 | if (conf->r10bio_pool) |
2336 | mempool_destroy(conf->r10bio_pool); | 2960 | mempool_destroy(conf->r10bio_pool); |
2337 | safe_put_page(conf->tmppage); | 2961 | safe_put_page(conf->tmppage); |
@@ -2349,8 +2973,7 @@ static int stop(mddev_t *mddev) | |||
2349 | raise_barrier(conf, 0); | 2973 | raise_barrier(conf, 0); |
2350 | lower_barrier(conf); | 2974 | lower_barrier(conf); |
2351 | 2975 | ||
2352 | md_unregister_thread(mddev->thread); | 2976 | md_unregister_thread(&mddev->thread); |
2353 | mddev->thread = NULL; | ||
2354 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 2977 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
2355 | if (conf->r10bio_pool) | 2978 | if (conf->r10bio_pool) |
2356 | mempool_destroy(conf->r10bio_pool); | 2979 | mempool_destroy(conf->r10bio_pool); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 944b1104d3b..79cb52a0d4a 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t; | |||
6 | struct mirror_info { | 6 | struct mirror_info { |
7 | mdk_rdev_t *rdev; | 7 | mdk_rdev_t *rdev; |
8 | sector_t head_position; | 8 | sector_t head_position; |
9 | int recovery_disabled; /* matches | ||
10 | * mddev->recovery_disabled | ||
11 | * when we shouldn't try | ||
12 | * recovering this device. | ||
13 | */ | ||
9 | }; | 14 | }; |
10 | 15 | ||
11 | typedef struct r10bio_s r10bio_t; | 16 | typedef struct r10bio_s r10bio_t; |
@@ -113,10 +118,26 @@ struct r10bio_s { | |||
113 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | 118 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer |
114 | */ | 119 | */ |
115 | #define IO_BLOCKED ((struct bio*)1) | 120 | #define IO_BLOCKED ((struct bio*)1) |
121 | /* When we successfully write to a known bad-block, we need to remove the | ||
122 | * bad-block marking which must be done from process context. So we record | ||
123 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
124 | */ | ||
125 | #define IO_MADE_GOOD ((struct bio *)2) | ||
126 | |||
127 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
116 | 128 | ||
117 | /* bits for r10bio.state */ | 129 | /* bits for r10bio.state */ |
118 | #define R10BIO_Uptodate 0 | 130 | #define R10BIO_Uptodate 0 |
119 | #define R10BIO_IsSync 1 | 131 | #define R10BIO_IsSync 1 |
120 | #define R10BIO_IsRecover 2 | 132 | #define R10BIO_IsRecover 2 |
121 | #define R10BIO_Degraded 3 | 133 | #define R10BIO_Degraded 3 |
134 | /* Set ReadError on bios that experience a read error | ||
135 | * so that raid10d knows what to do with them. | ||
136 | */ | ||
137 | #define R10BIO_ReadError 4 | ||
138 | /* If a write for this request means we can clear some | ||
139 | * known-bad-block records, we set this flag. | ||
140 | */ | ||
141 | #define R10BIO_MadeGood 5 | ||
142 | #define R10BIO_WriteError 6 | ||
122 | #endif | 143 | #endif |
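/*
 * [Editorial sketch - not part of this diff.]  Hypothetical helper (the
 * name 'put_dev_bio' is illustrative only) showing how the sentinel values
 * and BIO_SPECIAL() above are meant to be handled when walking
 * r10_bio->devs[] in process context: IO_BLOCKED and IO_MADE_GOOD are
 * markers, not real bios, and must never be dereferenced or put.
 */
static inline void put_dev_bio(r10bio_t *r10_bio, int n)
{
	struct bio *bio = r10_bio->devs[n].bio;

	if (bio && !BIO_SPECIAL(bio))
		bio_put(bio);	/* only genuine bios may be released */
}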
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b72edf35ec5..b6200c3935c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
53 | #include <linux/slab.h> | 53 | #include <linux/slab.h> |
54 | #include <linux/ratelimit.h> | ||
54 | #include "md.h" | 55 | #include "md.h" |
55 | #include "raid5.h" | 56 | #include "raid5.h" |
56 | #include "raid0.h" | 57 | #include "raid0.h" |
@@ -96,8 +97,6 @@ | |||
96 | #define __inline__ | 97 | #define __inline__ |
97 | #endif | 98 | #endif |
98 | 99 | ||
99 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) | ||
100 | |||
101 | /* | 100 | /* |
102 | * We maintain a biased count of active stripes in the bottom 16 bits of | 101 | * We maintain a biased count of active stripes in the bottom 16 bits of |
103 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 102 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) | |||
341 | (unsigned long long)sh->sector, i, dev->toread, | 340 | (unsigned long long)sh->sector, i, dev->toread, |
342 | dev->read, dev->towrite, dev->written, | 341 | dev->read, dev->towrite, dev->written, |
343 | test_bit(R5_LOCKED, &dev->flags)); | 342 | test_bit(R5_LOCKED, &dev->flags)); |
344 | BUG(); | 343 | WARN_ON(1); |
345 | } | 344 | } |
346 | dev->flags = 0; | 345 | dev->flags = 0; |
347 | raid5_build_block(sh, i, previous); | 346 | raid5_build_block(sh, i, previous); |
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
527 | atomic_inc(&rdev->nr_pending); | 526 | atomic_inc(&rdev->nr_pending); |
528 | rcu_read_unlock(); | 527 | rcu_read_unlock(); |
529 | 528 | ||
529 | /* We have already checked bad blocks for reads. Now | ||
530 | * need to check for writes. | ||
531 | */ | ||
532 | while ((rw & WRITE) && rdev && | ||
533 | test_bit(WriteErrorSeen, &rdev->flags)) { | ||
534 | sector_t first_bad; | ||
535 | int bad_sectors; | ||
536 | int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, | ||
537 | &first_bad, &bad_sectors); | ||
538 | if (!bad) | ||
539 | break; | ||
540 | |||
541 | if (bad < 0) { | ||
542 | set_bit(BlockedBadBlocks, &rdev->flags); | ||
543 | if (!conf->mddev->external && | ||
544 | conf->mddev->flags) { | ||
545 | /* It is very unlikely, but we might | ||
546 | * still need to write out the | ||
547 | * bad block log - better give it | ||
548 | * a chance */ | ||
549 | md_check_recovery(conf->mddev); | ||
550 | } | ||
551 | md_wait_for_blocked_rdev(rdev, conf->mddev); | ||
552 | } else { | ||
553 | /* Acknowledged bad block - skip the write */ | ||
554 | rdev_dec_pending(rdev, conf->mddev); | ||
555 | rdev = NULL; | ||
556 | } | ||
557 | } | ||
558 | |||
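/*
 * Editorial note on the loop above (relying on the md bad-block
 * convention): is_badblock() returns 0 when the range is clean, > 0 when it
 * overlaps only acknowledged bad blocks, and < 0 when some entries are not
 * yet acknowledged in the metadata.  Hence bad < 0 sets BlockedBadBlocks
 * and waits in md_wait_for_blocked_rdev() for the bad-block log to be
 * written out, while bad > 0 simply skips the write to the known-bad range.
 */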
530 | if (rdev) { | 559 | if (rdev) { |
531 | if (s->syncing || s->expanding || s->expanded) | 560 | if (s->syncing || s->expanding || s->expanded) |
532 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 561 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
548 | bi->bi_io_vec[0].bv_offset = 0; | 577 | bi->bi_io_vec[0].bv_offset = 0; |
549 | bi->bi_size = STRIPE_SIZE; | 578 | bi->bi_size = STRIPE_SIZE; |
550 | bi->bi_next = NULL; | 579 | bi->bi_next = NULL; |
551 | if ((rw & WRITE) && | ||
552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
553 | atomic_add(STRIPE_SECTORS, | ||
554 | &rdev->corrected_errors); | ||
555 | generic_make_request(bi); | 580 | generic_make_request(bi); |
556 | } else { | 581 | } else { |
557 | if (rw & WRITE) | 582 | if (rw & WRITE) |
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1045 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
1021 | struct bio *wbi; | 1046 | struct bio *wbi; |
1022 | 1047 | ||
1023 | spin_lock(&sh->lock); | 1048 | spin_lock_irq(&sh->raid_conf->device_lock); |
1024 | chosen = dev->towrite; | 1049 | chosen = dev->towrite; |
1025 | dev->towrite = NULL; | 1050 | dev->towrite = NULL; |
1026 | BUG_ON(dev->written); | 1051 | BUG_ON(dev->written); |
1027 | wbi = dev->written = chosen; | 1052 | wbi = dev->written = chosen; |
1028 | spin_unlock(&sh->lock); | 1053 | spin_unlock_irq(&sh->raid_conf->device_lock); |
1029 | 1054 | ||
1030 | while (wbi && wbi->bi_sector < | 1055 | while (wbi && wbi->bi_sector < |
1031 | dev->sector + STRIPE_SECTORS) { | 1056 | dev->sector + STRIPE_SECTORS) { |
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1315 | static int grow_one_stripe(raid5_conf_t *conf) | 1340 | static int grow_one_stripe(raid5_conf_t *conf) |
1316 | { | 1341 | { |
1317 | struct stripe_head *sh; | 1342 | struct stripe_head *sh; |
1318 | sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); | 1343 | sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); |
1319 | if (!sh) | 1344 | if (!sh) |
1320 | return 0; | 1345 | return 0; |
1321 | memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); | 1346 | |
1322 | sh->raid_conf = conf; | 1347 | sh->raid_conf = conf; |
1323 | spin_lock_init(&sh->lock); | ||
1324 | #ifdef CONFIG_MULTICORE_RAID456 | 1348 | #ifdef CONFIG_MULTICORE_RAID456 |
1325 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1349 | init_waitqueue_head(&sh->ops.wait_for_ops); |
1326 | #endif | 1350 | #endif |
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1435 | return -ENOMEM; | 1459 | return -ENOMEM; |
1436 | 1460 | ||
1437 | for (i = conf->max_nr_stripes; i; i--) { | 1461 | for (i = conf->max_nr_stripes; i; i--) { |
1438 | nsh = kmem_cache_alloc(sc, GFP_KERNEL); | 1462 | nsh = kmem_cache_zalloc(sc, GFP_KERNEL); |
1439 | if (!nsh) | 1463 | if (!nsh) |
1440 | break; | 1464 | break; |
1441 | 1465 | ||
1442 | memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); | ||
1443 | |||
1444 | nsh->raid_conf = conf; | 1466 | nsh->raid_conf = conf; |
1445 | spin_lock_init(&nsh->lock); | ||
1446 | #ifdef CONFIG_MULTICORE_RAID456 | 1467 | #ifdef CONFIG_MULTICORE_RAID456 |
1447 | init_waitqueue_head(&nsh->ops.wait_for_ops); | 1468 | init_waitqueue_head(&nsh->ops.wait_for_ops); |
1448 | #endif | 1469 | #endif |
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1587 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1608 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1588 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | 1609 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { |
1589 | rdev = conf->disks[i].rdev; | 1610 | rdev = conf->disks[i].rdev; |
1590 | printk_rl(KERN_INFO "md/raid:%s: read error corrected" | 1611 | printk_ratelimited( |
1591 | " (%lu sectors at %llu on %s)\n", | 1612 | KERN_INFO |
1592 | mdname(conf->mddev), STRIPE_SECTORS, | 1613 | "md/raid:%s: read error corrected" |
1593 | (unsigned long long)(sh->sector | 1614 | " (%lu sectors at %llu on %s)\n", |
1594 | + rdev->data_offset), | 1615 | mdname(conf->mddev), STRIPE_SECTORS, |
1595 | bdevname(rdev->bdev, b)); | 1616 | (unsigned long long)(sh->sector |
1617 | + rdev->data_offset), | ||
1618 | bdevname(rdev->bdev, b)); | ||
1619 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
1596 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1620 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
1597 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1621 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
1598 | } | 1622 | } |
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1606 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1630 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1607 | atomic_inc(&rdev->read_errors); | 1631 | atomic_inc(&rdev->read_errors); |
1608 | if (conf->mddev->degraded >= conf->max_degraded) | 1632 | if (conf->mddev->degraded >= conf->max_degraded) |
1609 | printk_rl(KERN_WARNING | 1633 | printk_ratelimited( |
1610 | "md/raid:%s: read error not correctable " | 1634 | KERN_WARNING |
1611 | "(sector %llu on %s).\n", | 1635 | "md/raid:%s: read error not correctable " |
1612 | mdname(conf->mddev), | 1636 | "(sector %llu on %s).\n", |
1613 | (unsigned long long)(sh->sector | 1637 | mdname(conf->mddev), |
1614 | + rdev->data_offset), | 1638 | (unsigned long long)(sh->sector |
1615 | bdn); | 1639 | + rdev->data_offset), |
1640 | bdn); | ||
1616 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | 1641 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) |
1617 | /* Oh, no!!! */ | 1642 | /* Oh, no!!! */ |
1618 | printk_rl(KERN_WARNING | 1643 | printk_ratelimited( |
1619 | "md/raid:%s: read error NOT corrected!! " | 1644 | KERN_WARNING |
1620 | "(sector %llu on %s).\n", | 1645 | "md/raid:%s: read error NOT corrected!! " |
1621 | mdname(conf->mddev), | 1646 | "(sector %llu on %s).\n", |
1622 | (unsigned long long)(sh->sector | 1647 | mdname(conf->mddev), |
1623 | + rdev->data_offset), | 1648 | (unsigned long long)(sh->sector |
1624 | bdn); | 1649 | + rdev->data_offset), |
1650 | bdn); | ||
1625 | else if (atomic_read(&rdev->read_errors) | 1651 | else if (atomic_read(&rdev->read_errors) |
1626 | > conf->max_nr_stripes) | 1652 | > conf->max_nr_stripes) |
1627 | printk(KERN_WARNING | 1653 | printk(KERN_WARNING |
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1649 | raid5_conf_t *conf = sh->raid_conf; | 1675 | raid5_conf_t *conf = sh->raid_conf; |
1650 | int disks = sh->disks, i; | 1676 | int disks = sh->disks, i; |
1651 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1677 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
1678 | sector_t first_bad; | ||
1679 | int bad_sectors; | ||
1652 | 1680 | ||
1653 | for (i=0 ; i<disks; i++) | 1681 | for (i=0 ; i<disks; i++) |
1654 | if (bi == &sh->dev[i].req) | 1682 | if (bi == &sh->dev[i].req) |
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1662 | return; | 1690 | return; |
1663 | } | 1691 | } |
1664 | 1692 | ||
1665 | if (!uptodate) | 1693 | if (!uptodate) { |
1666 | md_error(conf->mddev, conf->disks[i].rdev); | 1694 | set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags); |
1695 | set_bit(R5_WriteError, &sh->dev[i].flags); | ||
1696 | } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS, | ||
1697 | &first_bad, &bad_sectors)) | ||
1698 | set_bit(R5_MadeGood, &sh->dev[i].flags); | ||
1667 | 1699 | ||
1668 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 1700 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); |
1669 | 1701 | ||
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1710 | */ | 1742 | */ |
1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1743 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1712 | } | 1744 | } |
1745 | set_bit(Blocked, &rdev->flags); | ||
1713 | set_bit(Faulty, &rdev->flags); | 1746 | set_bit(Faulty, &rdev->flags); |
1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1747 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1715 | printk(KERN_ALERT | 1748 | printk(KERN_ALERT |
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1760 | /* | 1793 | /* |
1761 | * Select the parity disk based on the user selected algorithm. | 1794 | * Select the parity disk based on the user selected algorithm. |
1762 | */ | 1795 | */ |
1763 | pd_idx = qd_idx = ~0; | 1796 | pd_idx = qd_idx = -1; |
1764 | switch(conf->level) { | 1797 | switch(conf->level) { |
1765 | case 4: | 1798 | case 4: |
1766 | pd_idx = data_disks; | 1799 | pd_idx = data_disks; |
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2143 | raid5_conf_t *conf = sh->raid_conf; | 2176 | raid5_conf_t *conf = sh->raid_conf; |
2144 | int firstwrite=0; | 2177 | int firstwrite=0; |
2145 | 2178 | ||
2146 | pr_debug("adding bh b#%llu to stripe s#%llu\n", | 2179 | pr_debug("adding bi b#%llu to stripe s#%llu\n", |
2147 | (unsigned long long)bi->bi_sector, | 2180 | (unsigned long long)bi->bi_sector, |
2148 | (unsigned long long)sh->sector); | 2181 | (unsigned long long)sh->sector); |
2149 | 2182 | ||
2150 | 2183 | ||
2151 | spin_lock(&sh->lock); | ||
2152 | spin_lock_irq(&conf->device_lock); | 2184 | spin_lock_irq(&conf->device_lock); |
2153 | if (forwrite) { | 2185 | if (forwrite) { |
2154 | bip = &sh->dev[dd_idx].towrite; | 2186 | bip = &sh->dev[dd_idx].towrite; |
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2169 | bi->bi_next = *bip; | 2201 | bi->bi_next = *bip; |
2170 | *bip = bi; | 2202 | *bip = bi; |
2171 | bi->bi_phys_segments++; | 2203 | bi->bi_phys_segments++; |
2172 | spin_unlock_irq(&conf->device_lock); | ||
2173 | spin_unlock(&sh->lock); | ||
2174 | |||
2175 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
2176 | (unsigned long long)bi->bi_sector, | ||
2177 | (unsigned long long)sh->sector, dd_idx); | ||
2178 | |||
2179 | if (conf->mddev->bitmap && firstwrite) { | ||
2180 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
2181 | STRIPE_SECTORS, 0); | ||
2182 | sh->bm_seq = conf->seq_flush+1; | ||
2183 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
2184 | } | ||
2185 | 2204 | ||
2186 | if (forwrite) { | 2205 | if (forwrite) { |
2187 | /* check if page is covered */ | 2206 | /* check if page is covered */ |
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2196 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2215 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
2197 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2216 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
2198 | } | 2217 | } |
2218 | spin_unlock_irq(&conf->device_lock); | ||
2219 | |||
2220 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||
2221 | (unsigned long long)(*bip)->bi_sector, | ||
2222 | (unsigned long long)sh->sector, dd_idx); | ||
2223 | |||
2224 | if (conf->mddev->bitmap && firstwrite) { | ||
2225 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
2226 | STRIPE_SECTORS, 0); | ||
2227 | sh->bm_seq = conf->seq_flush+1; | ||
2228 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
2229 | } | ||
2199 | return 1; | 2230 | return 1; |
2200 | 2231 | ||
2201 | overlap: | 2232 | overlap: |
2202 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2233 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
2203 | spin_unlock_irq(&conf->device_lock); | 2234 | spin_unlock_irq(&conf->device_lock); |
2204 | spin_unlock(&sh->lock); | ||
2205 | return 0; | 2235 | return 0; |
2206 | } | 2236 | } |
2207 | 2237 | ||
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2238 | rcu_read_lock(); | 2268 | rcu_read_lock(); |
2239 | rdev = rcu_dereference(conf->disks[i].rdev); | 2269 | rdev = rcu_dereference(conf->disks[i].rdev); |
2240 | if (rdev && test_bit(In_sync, &rdev->flags)) | 2270 | if (rdev && test_bit(In_sync, &rdev->flags)) |
2241 | /* multiple read failures in one stripe */ | 2271 | atomic_inc(&rdev->nr_pending); |
2242 | md_error(conf->mddev, rdev); | 2272 | else |
2273 | rdev = NULL; | ||
2243 | rcu_read_unlock(); | 2274 | rcu_read_unlock(); |
2275 | if (rdev) { | ||
2276 | if (!rdev_set_badblocks( | ||
2277 | rdev, | ||
2278 | sh->sector, | ||
2279 | STRIPE_SECTORS, 0)) | ||
2280 | md_error(conf->mddev, rdev); | ||
2281 | rdev_dec_pending(rdev, conf->mddev); | ||
2282 | } | ||
2244 | } | 2283 | } |
2245 | spin_lock_irq(&conf->device_lock); | 2284 | spin_lock_irq(&conf->device_lock); |
2246 | /* fail all writes first */ | 2285 | /* fail all writes first */ |
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2308 | if (bitmap_end) | 2347 | if (bitmap_end) |
2309 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2348 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
2310 | STRIPE_SECTORS, 0, 0); | 2349 | STRIPE_SECTORS, 0, 0); |
2350 | /* If we were in the middle of a write the parity block might | ||
2351 | * still be locked - so just clear all R5_LOCKED flags | ||
2352 | */ | ||
2353 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
2311 | } | 2354 | } |
2312 | 2355 | ||
2313 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | 2356 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) |
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, | |||
2315 | md_wakeup_thread(conf->mddev->thread); | 2358 | md_wakeup_thread(conf->mddev->thread); |
2316 | } | 2359 | } |
2317 | 2360 | ||
2318 | /* fetch_block5 - checks the given member device to see if its data needs | 2361 | static void |
2319 | * to be read or computed to satisfy a request. | 2362 | handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh, |
2320 | * | 2363 | struct stripe_head_state *s) |
2321 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
2322 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
2323 | */ | ||
2324 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, | ||
2325 | int disk_idx, int disks) | ||
2326 | { | ||
2327 | struct r5dev *dev = &sh->dev[disk_idx]; | ||
2328 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | ||
2329 | |||
2330 | /* is the data in this block needed, and can we get it? */ | ||
2331 | if (!test_bit(R5_LOCKED, &dev->flags) && | ||
2332 | !test_bit(R5_UPTODATE, &dev->flags) && | ||
2333 | (dev->toread || | ||
2334 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | ||
2335 | s->syncing || s->expanding || | ||
2336 | (s->failed && | ||
2337 | (failed_dev->toread || | ||
2338 | (failed_dev->towrite && | ||
2339 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { | ||
2340 | /* We would like to get this block, possibly by computing it, | ||
2341 | * otherwise read it if the backing disk is insync | ||
2342 | */ | ||
2343 | if ((s->uptodate == disks - 1) && | ||
2344 | (s->failed && disk_idx == s->failed_num)) { | ||
2345 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2346 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2347 | set_bit(R5_Wantcompute, &dev->flags); | ||
2348 | sh->ops.target = disk_idx; | ||
2349 | sh->ops.target2 = -1; | ||
2350 | s->req_compute = 1; | ||
2351 | /* Careful: from this point on 'uptodate' is in the eye | ||
2352 | * of raid_run_ops which services 'compute' operations | ||
2353 | * before writes. R5_Wantcompute flags a block that will | ||
2354 | * be R5_UPTODATE by the time it is needed for a | ||
2355 | * subsequent operation. | ||
2356 | */ | ||
2357 | s->uptodate++; | ||
2358 | return 1; /* uptodate + compute == disks */ | ||
2359 | } else if (test_bit(R5_Insync, &dev->flags)) { | ||
2360 | set_bit(R5_LOCKED, &dev->flags); | ||
2361 | set_bit(R5_Wantread, &dev->flags); | ||
2362 | s->locked++; | ||
2363 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | ||
2364 | s->syncing); | ||
2365 | } | ||
2366 | } | ||
2367 | |||
2368 | return 0; | ||
2369 | } | ||
2370 | |||
2371 | /** | ||
2372 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
2373 | */ | ||
2374 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
2375 | struct stripe_head_state *s, int disks) | ||
2376 | { | 2364 | { |
2365 | int abort = 0; | ||
2377 | int i; | 2366 | int i; |
2378 | 2367 | ||
2379 | /* look for blocks to read/compute, skip this if a compute | 2368 | md_done_sync(conf->mddev, STRIPE_SECTORS, 0); |
2380 | * is already in flight, or if the stripe contents are in the | 2369 | clear_bit(STRIPE_SYNCING, &sh->state); |
2381 | * midst of changing due to a write | 2370 | s->syncing = 0; |
2371 | /* There is nothing more to do for sync/check/repair. | ||
2372 | * For recover we need to record a bad block on all | ||
2373 | * non-sync devices, or abort the recovery | ||
2382 | */ | 2374 | */ |
2383 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2375 | if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) |
2384 | !sh->reconstruct_state) | 2376 | return; |
2385 | for (i = disks; i--; ) | 2377 | /* During recovery devices cannot be removed, so locking and |
2386 | if (fetch_block5(sh, s, i, disks)) | 2378 | * refcounting of rdevs is not needed |
2387 | break; | 2379 | */ |
2388 | set_bit(STRIPE_HANDLE, &sh->state); | 2380 | for (i = 0; i < conf->raid_disks; i++) { |
2381 | mdk_rdev_t *rdev = conf->disks[i].rdev; | ||
2382 | if (!rdev | ||
2383 | || test_bit(Faulty, &rdev->flags) | ||
2384 | || test_bit(In_sync, &rdev->flags)) | ||
2385 | continue; | ||
2386 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
2387 | STRIPE_SECTORS, 0)) | ||
2388 | abort = 1; | ||
2389 | } | ||
2390 | if (abort) { | ||
2391 | conf->recovery_disabled = conf->mddev->recovery_disabled; | ||
2392 | set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); | ||
2393 | } | ||
2389 | } | 2394 | } |
2390 | 2395 | ||
2391 | /* fetch_block6 - checks the given member device to see if its data needs | 2396 | /* fetch_block - checks the given member device to see if its data needs |
2392 | * to be read or computed to satisfy a request. | 2397 | * to be read or computed to satisfy a request. |
2393 | * | 2398 | * |
2394 | * Returns 1 when no more member devices need to be checked, otherwise returns | 2399 | * Returns 1 when no more member devices need to be checked, otherwise returns |
2395 | * 0 to tell the loop in handle_stripe_fill6 to continue | 2400 | * 0 to tell the loop in handle_stripe_fill to continue |
2396 | */ | 2401 | */ |
2397 | static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | 2402 | static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, |
2398 | struct r6_state *r6s, int disk_idx, int disks) | 2403 | int disk_idx, int disks) |
2399 | { | 2404 | { |
2400 | struct r5dev *dev = &sh->dev[disk_idx]; | 2405 | struct r5dev *dev = &sh->dev[disk_idx]; |
2401 | struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], | 2406 | struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], |
2402 | &sh->dev[r6s->failed_num[1]] }; | 2407 | &sh->dev[s->failed_num[1]] }; |
2403 | 2408 | ||
2409 | /* is the data in this block needed, and can we get it? */ | ||
2404 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2410 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2405 | !test_bit(R5_UPTODATE, &dev->flags) && | 2411 | !test_bit(R5_UPTODATE, &dev->flags) && |
2406 | (dev->toread || | 2412 | (dev->toread || |
2407 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 2413 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
2408 | s->syncing || s->expanding || | 2414 | s->syncing || s->expanding || |
2409 | (s->failed >= 1 && | 2415 | (s->failed >= 1 && fdev[0]->toread) || |
2410 | (fdev[0]->toread || s->to_write)) || | 2416 | (s->failed >= 2 && fdev[1]->toread) || |
2411 | (s->failed >= 2 && | 2417 | (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && |
2412 | (fdev[1]->toread || s->to_write)))) { | 2418 | !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || |
2419 | (sh->raid_conf->level == 6 && s->failed && s->to_write))) { | ||
2413 | /* we would like to get this block, possibly by computing it, | 2420 | /* we would like to get this block, possibly by computing it, |
2414 | * otherwise read it if the backing disk is insync | 2421 | * otherwise read it if the backing disk is insync |
2415 | */ | 2422 | */ |
2416 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); | 2423 | BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); |
2417 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); | 2424 | BUG_ON(test_bit(R5_Wantread, &dev->flags)); |
2418 | if ((s->uptodate == disks - 1) && | 2425 | if ((s->uptodate == disks - 1) && |
2419 | (s->failed && (disk_idx == r6s->failed_num[0] || | 2426 | (s->failed && (disk_idx == s->failed_num[0] || |
2420 | disk_idx == r6s->failed_num[1]))) { | 2427 | disk_idx == s->failed_num[1]))) { |
2421 | /* have disk failed, and we're requested to fetch it; | 2428 | /* have disk failed, and we're requested to fetch it; |
2422 | * do compute it | 2429 | * do compute it |
2423 | */ | 2430 | */ |
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
2429 | sh->ops.target = disk_idx; | 2436 | sh->ops.target = disk_idx; |
2430 | sh->ops.target2 = -1; /* no 2nd target */ | 2437 | sh->ops.target2 = -1; /* no 2nd target */ |
2431 | s->req_compute = 1; | 2438 | s->req_compute = 1; |
2439 | /* Careful: from this point on 'uptodate' is in the eye | ||
2440 | * of raid_run_ops which services 'compute' operations | ||
2441 | * before writes. R5_Wantcompute flags a block that will | ||
2442 | * be R5_UPTODATE by the time it is needed for a | ||
2443 | * subsequent operation. | ||
2444 | */ | ||
2432 | s->uptodate++; | 2445 | s->uptodate++; |
2433 | return 1; | 2446 | return 1; |
2434 | } else if (s->uptodate == disks-2 && s->failed >= 2) { | 2447 | } else if (s->uptodate == disks-2 && s->failed >= 2) { |
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, | |||
2469 | } | 2482 | } |
2470 | 2483 | ||
2471 | /** | 2484 | /** |
2472 | * handle_stripe_fill6 - read or compute data to satisfy pending requests. | 2485 | * handle_stripe_fill - read or compute data to satisfy pending requests. |
2473 | */ | 2486 | */ |
2474 | static void handle_stripe_fill6(struct stripe_head *sh, | 2487 | static void handle_stripe_fill(struct stripe_head *sh, |
2475 | struct stripe_head_state *s, struct r6_state *r6s, | 2488 | struct stripe_head_state *s, |
2476 | int disks) | 2489 | int disks) |
2477 | { | 2490 | { |
2478 | int i; | 2491 | int i; |
2479 | 2492 | ||
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh, | |||
2484 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && | 2497 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
2485 | !sh->reconstruct_state) | 2498 | !sh->reconstruct_state) |
2486 | for (i = disks; i--; ) | 2499 | for (i = disks; i--; ) |
2487 | if (fetch_block6(sh, s, r6s, i, disks)) | 2500 | if (fetch_block(sh, s, i, disks)) |
2488 | break; | 2501 | break; |
2489 | set_bit(STRIPE_HANDLE, &sh->state); | 2502 | set_bit(STRIPE_HANDLE, &sh->state); |
2490 | } | 2503 | } |
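Note: the new comment added to fetch_block() above documents why s->uptodate is bumped at the moment a compute is scheduled rather than when it completes: raid_run_ops services compute operations before any dependent writes, so a block flagged R5_Wantcompute can be treated as already available by later decisions in the same handle_stripe() pass. A minimal model of that optimistic accounting (the names here are made up for illustration):

/* A scheduled compute is counted as "up to date" immediately, because the
 * compute runs before anything that consumes the block.  Model only.
 */
#include <assert.h>

struct counts { int uptodate, compute; };

static void schedule_compute(struct counts *s, int disks)
{
	s->compute++;
	s->uptodate++;		/* optimistic: ready before it is needed */
	assert(s->uptodate <= disks);
}

int main(void)
{
	struct counts s = { .uptodate = 3, .compute = 0 };

	schedule_compute(&s, 5);	/* 4 of 5 blocks now "available" */
	return s.uptodate == 4 ? 0 : 1;
}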
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf, | |||
2540 | md_wakeup_thread(conf->mddev->thread); | 2553 | md_wakeup_thread(conf->mddev->thread); |
2541 | } | 2554 | } |
2542 | 2555 | ||
2543 | static void handle_stripe_dirtying5(raid5_conf_t *conf, | 2556 | static void handle_stripe_dirtying(raid5_conf_t *conf, |
2544 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2557 | struct stripe_head *sh, |
2558 | struct stripe_head_state *s, | ||
2559 | int disks) | ||
2545 | { | 2560 | { |
2546 | int rmw = 0, rcw = 0, i; | 2561 | int rmw = 0, rcw = 0, i; |
2547 | for (i = disks; i--; ) { | 2562 | if (conf->max_degraded == 2) { |
2563 | /* RAID6 requires 'rcw' in current implementation | ||
2564 | * Calculate the real rcw later - for now fake it | ||
2565 | * look like rcw is cheaper | ||
2566 | */ | ||
2567 | rcw = 1; rmw = 2; | ||
2568 | } else for (i = disks; i--; ) { | ||
2548 | /* would I have to read this buffer for read_modify_write */ | 2569 | /* would I have to read this buffer for read_modify_write */ |
2549 | struct r5dev *dev = &sh->dev[i]; | 2570 | struct r5dev *dev = &sh->dev[i]; |
2550 | if ((dev->towrite || i == sh->pd_idx) && | 2571 | if ((dev->towrite || i == sh->pd_idx) && |
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2591 | } | 2612 | } |
2592 | } | 2613 | } |
2593 | } | 2614 | } |
2594 | if (rcw <= rmw && rcw > 0) | 2615 | if (rcw <= rmw && rcw > 0) { |
2595 | /* want reconstruct write, but need to get some data */ | 2616 | /* want reconstruct write, but need to get some data */ |
2617 | rcw = 0; | ||
2596 | for (i = disks; i--; ) { | 2618 | for (i = disks; i--; ) { |
2597 | struct r5dev *dev = &sh->dev[i]; | 2619 | struct r5dev *dev = &sh->dev[i]; |
2598 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | 2620 | if (!test_bit(R5_OVERWRITE, &dev->flags) && |
2599 | i != sh->pd_idx && | 2621 | i != sh->pd_idx && i != sh->qd_idx && |
2600 | !test_bit(R5_LOCKED, &dev->flags) && | 2622 | !test_bit(R5_LOCKED, &dev->flags) && |
2601 | !(test_bit(R5_UPTODATE, &dev->flags) || | 2623 | !(test_bit(R5_UPTODATE, &dev->flags) || |
2602 | test_bit(R5_Wantcompute, &dev->flags)) && | 2624 | test_bit(R5_Wantcompute, &dev->flags))) { |
2603 | test_bit(R5_Insync, &dev->flags)) { | 2625 | rcw++; |
2626 | if (!test_bit(R5_Insync, &dev->flags)) | ||
2627 | continue; /* it's a failed drive */ | ||
2604 | if ( | 2628 | if ( |
2605 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 2629 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
2606 | pr_debug("Read_old block " | 2630 | pr_debug("Read_old block " |
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2614 | } | 2638 | } |
2615 | } | 2639 | } |
2616 | } | 2640 | } |
2641 | } | ||
2617 | /* now if nothing is locked, and if we have enough data, | 2642 | /* now if nothing is locked, and if we have enough data, |
2618 | * we can start a write request | 2643 | * we can start a write request |
2619 | */ | 2644 | */ |
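Note: handle_stripe_dirtying() now serves both levels. For RAID6 it simply forces reconstruct-write by pretending rcw is cheaper (rcw = 1, rmw = 2); for RAID4/5 it still counts how many pre-reads each strategy would need and picks the cheaper one. A standalone sketch of that cost comparison follows; the fields are simplified stand-ins, and the real kernel test also considers R5_Insync and pre-read throttling.

/* Count the reads needed for read-modify-write vs reconstruct-write and
 * pick the cheaper strategy.  Simplified model, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct blk { bool towrite, overwrite, uptodate, locked; };

enum strategy { RMW, RCW };

static enum strategy pick(const struct blk *d, int disks, int pd_idx,
			  int qd_idx, int max_degraded)
{
	int rmw = 0, rcw = 0, i;

	if (max_degraded == 2)
		return RCW;		/* RAID6 path: rcw only */

	for (i = 0; i < disks; i++) {
		/* rmw reads old data of written blocks plus old parity */
		if ((d[i].towrite || i == pd_idx) &&
		    !d[i].uptodate && !d[i].locked)
			rmw++;
		/* rcw reads every block it will not overwrite */
		if (!d[i].overwrite && i != pd_idx && i != qd_idx &&
		    !d[i].uptodate && !d[i].locked)
			rcw++;
	}
	return rmw < rcw ? RMW : RCW;
}

int main(void)
{
	struct blk d[5] = { { .towrite = true, .overwrite = true } };

	/* one overwritten block out of five: rmw (2 reads) beats rcw (3) */
	printf("%s\n", pick(d, 5, 4, -1, 1) == RMW ? "rmw" : "rcw");
	return 0;
}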
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, | |||
2630 | schedule_reconstruction(sh, s, rcw == 0, 0); | 2655 | schedule_reconstruction(sh, s, rcw == 0, 0); |
2631 | } | 2656 | } |
2632 | 2657 | ||
2633 | static void handle_stripe_dirtying6(raid5_conf_t *conf, | ||
2634 | struct stripe_head *sh, struct stripe_head_state *s, | ||
2635 | struct r6_state *r6s, int disks) | ||
2636 | { | ||
2637 | int rcw = 0, pd_idx = sh->pd_idx, i; | ||
2638 | int qd_idx = sh->qd_idx; | ||
2639 | |||
2640 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2641 | for (i = disks; i--; ) { | ||
2642 | struct r5dev *dev = &sh->dev[i]; | ||
2643 | /* check if we haven't enough data */ | ||
2644 | if (!test_bit(R5_OVERWRITE, &dev->flags) && | ||
2645 | i != pd_idx && i != qd_idx && | ||
2646 | !test_bit(R5_LOCKED, &dev->flags) && | ||
2647 | !(test_bit(R5_UPTODATE, &dev->flags) || | ||
2648 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
2649 | rcw++; | ||
2650 | if (!test_bit(R5_Insync, &dev->flags)) | ||
2651 | continue; /* it's a failed drive */ | ||
2652 | |||
2653 | if ( | ||
2654 | test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | ||
2655 | pr_debug("Read_old stripe %llu " | ||
2656 | "block %d for Reconstruct\n", | ||
2657 | (unsigned long long)sh->sector, i); | ||
2658 | set_bit(R5_LOCKED, &dev->flags); | ||
2659 | set_bit(R5_Wantread, &dev->flags); | ||
2660 | s->locked++; | ||
2661 | } else { | ||
2662 | pr_debug("Request delayed stripe %llu " | ||
2663 | "block %d for Reconstruct\n", | ||
2664 | (unsigned long long)sh->sector, i); | ||
2665 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2666 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2667 | } | ||
2668 | } | ||
2669 | } | ||
2670 | /* now if nothing is locked, and if we have enough data, we can start a | ||
2671 | * write request | ||
2672 | */ | ||
2673 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | ||
2674 | s->locked == 0 && rcw == 0 && | ||
2675 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
2676 | schedule_reconstruction(sh, s, 1, 0); | ||
2677 | } | ||
2678 | } | ||
2679 | |||
2680 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2658 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
2681 | struct stripe_head_state *s, int disks) | 2659 | struct stripe_head_state *s, int disks) |
2682 | { | 2660 | { |
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2695 | s->uptodate--; | 2673 | s->uptodate--; |
2696 | break; | 2674 | break; |
2697 | } | 2675 | } |
2698 | dev = &sh->dev[s->failed_num]; | 2676 | dev = &sh->dev[s->failed_num[0]]; |
2699 | /* fall through */ | 2677 | /* fall through */ |
2700 | case check_state_compute_result: | 2678 | case check_state_compute_result: |
2701 | sh->check_state = check_state_idle; | 2679 | sh->check_state = check_state_idle; |
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | |||
2767 | 2745 | ||
2768 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | 2746 | static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, |
2769 | struct stripe_head_state *s, | 2747 | struct stripe_head_state *s, |
2770 | struct r6_state *r6s, int disks) | 2748 | int disks) |
2771 | { | 2749 | { |
2772 | int pd_idx = sh->pd_idx; | 2750 | int pd_idx = sh->pd_idx; |
2773 | int qd_idx = sh->qd_idx; | 2751 | int qd_idx = sh->qd_idx; |
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2786 | switch (sh->check_state) { | 2764 | switch (sh->check_state) { |
2787 | case check_state_idle: | 2765 | case check_state_idle: |
2788 | /* start a new check operation if there are < 2 failures */ | 2766 | /* start a new check operation if there are < 2 failures */ |
2789 | if (s->failed == r6s->q_failed) { | 2767 | if (s->failed == s->q_failed) { |
2790 | /* The only possible failed device holds Q, so it | 2768 | /* The only possible failed device holds Q, so it |
2791 | * makes sense to check P (If anything else were failed, | 2769 | * makes sense to check P (If anything else were failed, |
2792 | * we would have used P to recreate it). | 2770 | * we would have used P to recreate it). |
2793 | */ | 2771 | */ |
2794 | sh->check_state = check_state_run; | 2772 | sh->check_state = check_state_run; |
2795 | } | 2773 | } |
2796 | if (!r6s->q_failed && s->failed < 2) { | 2774 | if (!s->q_failed && s->failed < 2) { |
2797 | /* Q is not failed, and we didn't use it to generate | 2775 | /* Q is not failed, and we didn't use it to generate |
2798 | * anything, so it makes sense to check it | 2776 | * anything, so it makes sense to check it |
2799 | */ | 2777 | */ |
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2835 | */ | 2813 | */ |
2836 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ | 2814 | BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ |
2837 | if (s->failed == 2) { | 2815 | if (s->failed == 2) { |
2838 | dev = &sh->dev[r6s->failed_num[1]]; | 2816 | dev = &sh->dev[s->failed_num[1]]; |
2839 | s->locked++; | 2817 | s->locked++; |
2840 | set_bit(R5_LOCKED, &dev->flags); | 2818 | set_bit(R5_LOCKED, &dev->flags); |
2841 | set_bit(R5_Wantwrite, &dev->flags); | 2819 | set_bit(R5_Wantwrite, &dev->flags); |
2842 | } | 2820 | } |
2843 | if (s->failed >= 1) { | 2821 | if (s->failed >= 1) { |
2844 | dev = &sh->dev[r6s->failed_num[0]]; | 2822 | dev = &sh->dev[s->failed_num[0]]; |
2845 | s->locked++; | 2823 | s->locked++; |
2846 | set_bit(R5_LOCKED, &dev->flags); | 2824 | set_bit(R5_LOCKED, &dev->flags); |
2847 | set_bit(R5_Wantwrite, &dev->flags); | 2825 | set_bit(R5_Wantwrite, &dev->flags); |
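Note: with q_failed and failed_num[] now living in stripe_head_state, the RAID6 check-start decision above is compact: P is worth verifying when the only failure (if any) is Q itself, and Q is worth verifying when it is alive and was not needed for recovery. A small model of just that decision (stand-in types, not the kernel state machine):

/* Which parity blocks should a RAID6 scrub verify?  Modelled loosely on
 * handle_parity_checks6(); 'failed' is a count, 'q_failed' a flag.
 */
#include <stdbool.h>
#include <stdio.h>

struct check { bool check_p, check_q; };

static struct check decide(int failed, bool q_failed)
{
	struct check c = { false, false };

	/* every failure (possibly none) is Q itself, so P was not used
	 * to rebuild anything and is meaningful to verify */
	if (failed == (q_failed ? 1 : 0))
		c.check_p = true;

	/* Q is checkable if it is alive and was not used for recovery */
	if (!q_failed && failed < 2)
		c.check_q = true;

	return c;
}

int main(void)
{
	struct check c = decide(1, true);	/* only Q has failed */

	printf("check P:%d Q:%d\n", c.check_p, c.check_q);	/* P:1 Q:0 */
	return 0;
}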
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2928 | } | 2906 | } |
2929 | } | 2907 | } |
2930 | 2908 | ||
2931 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | 2909 | static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh) |
2932 | struct r6_state *r6s) | ||
2933 | { | 2910 | { |
2934 | int i; | 2911 | int i; |
2935 | 2912 | ||
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2971 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2948 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
2972 | for (j = 0; j < conf->raid_disks; j++) | 2949 | for (j = 0; j < conf->raid_disks; j++) |
2973 | if (j != sh2->pd_idx && | 2950 | if (j != sh2->pd_idx && |
2974 | (!r6s || j != sh2->qd_idx) && | 2951 | j != sh2->qd_idx && |
2975 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | 2952 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) |
2976 | break; | 2953 | break; |
2977 | if (j == conf->raid_disks) { | 2954 | if (j == conf->raid_disks) { |
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
3006 | * | 2983 | * |
3007 | */ | 2984 | */ |
3008 | 2985 | ||
3009 | static void handle_stripe5(struct stripe_head *sh) | 2986 | static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) |
3010 | { | 2987 | { |
3011 | raid5_conf_t *conf = sh->raid_conf; | 2988 | raid5_conf_t *conf = sh->raid_conf; |
3012 | int disks = sh->disks, i; | 2989 | int disks = sh->disks; |
3013 | struct bio *return_bi = NULL; | ||
3014 | struct stripe_head_state s; | ||
3015 | struct r5dev *dev; | 2990 | struct r5dev *dev; |
3016 | mdk_rdev_t *blocked_rdev = NULL; | 2991 | int i; |
3017 | int prexor; | ||
3018 | int dec_preread_active = 0; | ||
3019 | 2992 | ||
3020 | memset(&s, 0, sizeof(s)); | 2993 | memset(s, 0, sizeof(*s)); |
3021 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | ||
3022 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, | ||
3023 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, | ||
3024 | sh->reconstruct_state); | ||
3025 | 2994 | ||
3026 | spin_lock(&sh->lock); | 2995 | s->syncing = test_bit(STRIPE_SYNCING, &sh->state); |
3027 | clear_bit(STRIPE_HANDLE, &sh->state); | 2996 | s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
3028 | clear_bit(STRIPE_DELAYED, &sh->state); | 2997 | s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
3029 | 2998 | s->failed_num[0] = -1; | |
3030 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2999 | s->failed_num[1] = -1; |
3031 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
3032 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3033 | 3000 | ||
3034 | /* Now to look around and see what can be done */ | 3001 | /* Now to look around and see what can be done */ |
3035 | rcu_read_lock(); | 3002 | rcu_read_lock(); |
3003 | spin_lock_irq(&conf->device_lock); | ||
3036 | for (i=disks; i--; ) { | 3004 | for (i=disks; i--; ) { |
3037 | mdk_rdev_t *rdev; | 3005 | mdk_rdev_t *rdev; |
3006 | sector_t first_bad; | ||
3007 | int bad_sectors; | ||
3008 | int is_bad = 0; | ||
3038 | 3009 | ||
3039 | dev = &sh->dev[i]; | 3010 | dev = &sh->dev[i]; |
3040 | 3011 | ||
3041 | pr_debug("check %d: state 0x%lx toread %p read %p write %p " | 3012 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", |
3042 | "written %p\n", i, dev->flags, dev->toread, dev->read, | 3013 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
3043 | dev->towrite, dev->written); | 3014 | /* maybe we can reply to a read |
3044 | |||
3045 | /* maybe we can request a biofill operation | ||
3046 | * | 3015 | * |
3047 | * new wantfill requests are only permitted while | 3016 | * new wantfill requests are only permitted while |
3048 | * ops_complete_biofill is guaranteed to be inactive | 3017 | * ops_complete_biofill is guaranteed to be inactive |
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3052 | set_bit(R5_Wantfill, &dev->flags); | 3021 | set_bit(R5_Wantfill, &dev->flags); |
3053 | 3022 | ||
3054 | /* now count some things */ | 3023 | /* now count some things */ |
3055 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | 3024 | if (test_bit(R5_LOCKED, &dev->flags)) |
3056 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | 3025 | s->locked++; |
3057 | if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; | 3026 | if (test_bit(R5_UPTODATE, &dev->flags)) |
3027 | s->uptodate++; | ||
3028 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3029 | s->compute++; | ||
3030 | BUG_ON(s->compute > 2); | ||
3031 | } | ||
3058 | 3032 | ||
3059 | if (test_bit(R5_Wantfill, &dev->flags)) | 3033 | if (test_bit(R5_Wantfill, &dev->flags)) |
3060 | s.to_fill++; | 3034 | s->to_fill++; |
3061 | else if (dev->toread) | 3035 | else if (dev->toread) |
3062 | s.to_read++; | 3036 | s->to_read++; |
3063 | if (dev->towrite) { | 3037 | if (dev->towrite) { |
3064 | s.to_write++; | 3038 | s->to_write++; |
3065 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | 3039 | if (!test_bit(R5_OVERWRITE, &dev->flags)) |
3066 | s.non_overwrite++; | 3040 | s->non_overwrite++; |
3067 | } | 3041 | } |
3068 | if (dev->written) | 3042 | if (dev->written) |
3069 | s.written++; | 3043 | s->written++; |
3070 | rdev = rcu_dereference(conf->disks[i].rdev); | 3044 | rdev = rcu_dereference(conf->disks[i].rdev); |
3071 | if (blocked_rdev == NULL && | 3045 | if (rdev) { |
3072 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | 3046 | is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, |
3073 | blocked_rdev = rdev; | 3047 | &first_bad, &bad_sectors); |
3074 | atomic_inc(&rdev->nr_pending); | 3048 | if (s->blocked_rdev == NULL |
3049 | && (test_bit(Blocked, &rdev->flags) | ||
3050 | || is_bad < 0)) { | ||
3051 | if (is_bad < 0) | ||
3052 | set_bit(BlockedBadBlocks, | ||
3053 | &rdev->flags); | ||
3054 | s->blocked_rdev = rdev; | ||
3055 | atomic_inc(&rdev->nr_pending); | ||
3056 | } | ||
3075 | } | 3057 | } |
3076 | clear_bit(R5_Insync, &dev->flags); | 3058 | clear_bit(R5_Insync, &dev->flags); |
3077 | if (!rdev) | 3059 | if (!rdev) |
3078 | /* Not in-sync */; | 3060 | /* Not in-sync */; |
3079 | else if (test_bit(In_sync, &rdev->flags)) | 3061 | else if (is_bad) { |
3062 | /* also not in-sync */ | ||
3063 | if (!test_bit(WriteErrorSeen, &rdev->flags)) { | ||
3064 | /* treat as in-sync, but with a read error | ||
3065 | * which we can now try to correct | ||
3066 | */ | ||
3067 | set_bit(R5_Insync, &dev->flags); | ||
3068 | set_bit(R5_ReadError, &dev->flags); | ||
3069 | } | ||
3070 | } else if (test_bit(In_sync, &rdev->flags)) | ||
3080 | set_bit(R5_Insync, &dev->flags); | 3071 | set_bit(R5_Insync, &dev->flags); |
3081 | else { | 3072 | else if (!test_bit(Faulty, &rdev->flags)) { |
3082 | /* could be in-sync depending on recovery/reshape status */ | 3073 | /* in sync if before recovery_offset */ |
3083 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | 3074 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) |
3084 | set_bit(R5_Insync, &dev->flags); | 3075 | set_bit(R5_Insync, &dev->flags); |
3085 | } | 3076 | } |
3077 | if (test_bit(R5_WriteError, &dev->flags)) { | ||
3078 | clear_bit(R5_Insync, &dev->flags); | ||
3079 | if (!test_bit(Faulty, &rdev->flags)) { | ||
3080 | s->handle_bad_blocks = 1; | ||
3081 | atomic_inc(&rdev->nr_pending); | ||
3082 | } else | ||
3083 | clear_bit(R5_WriteError, &dev->flags); | ||
3084 | } | ||
3085 | if (test_bit(R5_MadeGood, &dev->flags)) { | ||
3086 | if (!test_bit(Faulty, &rdev->flags)) { | ||
3087 | s->handle_bad_blocks = 1; | ||
3088 | atomic_inc(&rdev->nr_pending); | ||
3089 | } else | ||
3090 | clear_bit(R5_MadeGood, &dev->flags); | ||
3091 | } | ||
3086 | if (!test_bit(R5_Insync, &dev->flags)) { | 3092 | if (!test_bit(R5_Insync, &dev->flags)) { |
3087 | /* The ReadError flag will just be confusing now */ | 3093 | /* The ReadError flag will just be confusing now */ |
3088 | clear_bit(R5_ReadError, &dev->flags); | 3094 | clear_bit(R5_ReadError, &dev->flags); |
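Note: analyse_stripe() classifies every member device in a single pass: a device with a known bad block over this stripe (and no write error seen) stays in-sync but is flagged R5_ReadError so the block gets rewritten, a device still recovering counts as in-sync for sectors below its recovery_offset, and whatever ends up not in-sync is recorded in failed_num[] (at most two slots) while failed keeps the full count. The standalone sketch below models that classification with invented field names:

/* Classify one stripe member roughly the way analyse_stripe() does.
 * Types, fields and the helper are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

struct member {
	bool present, in_sync, faulty, write_error_seen;
	bool bad_block_here;             /* bad block overlaps this stripe */
	unsigned long long recovery_offset;
};

struct verdict { bool insync, read_error; };

static struct verdict classify(const struct member *m,
			       unsigned long long stripe_end)
{
	struct verdict v = { false, false };

	if (!m->present)
		return v;                        /* missing: not in-sync  */
	if (m->bad_block_here) {
		if (!m->write_error_seen) {
			v.insync = true;         /* usable ...            */
			v.read_error = true;     /* ... but rewrite it    */
		}
	} else if (m->in_sync) {
		v.insync = true;
	} else if (!m->faulty && stripe_end <= m->recovery_offset) {
		v.insync = true;                 /* region already rebuilt */
	}
	return v;
}

int main(void)
{
	struct member m = { .present = true, .bad_block_here = true };
	struct verdict v = classify(&m, 8);

	printf("insync=%d read_error=%d\n", v.insync, v.read_error);
	return 0;
}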
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3091 | if (test_bit(R5_ReadError, &dev->flags)) | 3097 | if (test_bit(R5_ReadError, &dev->flags)) |
3092 | clear_bit(R5_Insync, &dev->flags); | 3098 | clear_bit(R5_Insync, &dev->flags); |
3093 | if (!test_bit(R5_Insync, &dev->flags)) { | 3099 | if (!test_bit(R5_Insync, &dev->flags)) { |
3094 | s.failed++; | 3100 | if (s->failed < 2) |
3095 | s.failed_num = i; | 3101 | s->failed_num[s->failed] = i; |
3102 | s->failed++; | ||
3096 | } | 3103 | } |
3097 | } | 3104 | } |
3105 | spin_unlock_irq(&conf->device_lock); | ||
3098 | rcu_read_unlock(); | 3106 | rcu_read_unlock(); |
3099 | |||
3100 | if (unlikely(blocked_rdev)) { | ||
3101 | if (s.syncing || s.expanding || s.expanded || | ||
3102 | s.to_write || s.written) { | ||
3103 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3104 | goto unlock; | ||
3105 | } | ||
3106 | /* There is nothing for the blocked_rdev to block */ | ||
3107 | rdev_dec_pending(blocked_rdev, conf->mddev); | ||
3108 | blocked_rdev = NULL; | ||
3109 | } | ||
3110 | |||
3111 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | ||
3112 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); | ||
3113 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
3114 | } | ||
3115 | |||
3116 | pr_debug("locked=%d uptodate=%d to_read=%d" | ||
3117 | " to_write=%d failed=%d failed_num=%d\n", | ||
3118 | s.locked, s.uptodate, s.to_read, s.to_write, | ||
3119 | s.failed, s.failed_num); | ||
3120 | /* check if the array has lost two devices and, if so, some requests might | ||
3121 | * need to be failed | ||
3122 | */ | ||
3123 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | ||
3124 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | ||
3125 | if (s.failed > 1 && s.syncing) { | ||
3126 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | ||
3127 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
3128 | s.syncing = 0; | ||
3129 | } | ||
3130 | |||
3131 | /* might be able to return some write requests if the parity block | ||
3132 | * is safe, or on a failed drive | ||
3133 | */ | ||
3134 | dev = &sh->dev[sh->pd_idx]; | ||
3135 | if ( s.written && | ||
3136 | ((test_bit(R5_Insync, &dev->flags) && | ||
3137 | !test_bit(R5_LOCKED, &dev->flags) && | ||
3138 | test_bit(R5_UPTODATE, &dev->flags)) || | ||
3139 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | ||
3140 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | ||
3141 | |||
3142 | /* Now we might consider reading some blocks, either to check/generate | ||
3143 | * parity, or to satisfy requests | ||
3144 | * or to load a block that is being partially written. | ||
3145 | */ | ||
3146 | if (s.to_read || s.non_overwrite || | ||
3147 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | ||
3148 | handle_stripe_fill5(sh, &s, disks); | ||
3149 | |||
3150 | /* Now we check to see if any write operations have recently | ||
3151 | * completed | ||
3152 | */ | ||
3153 | prexor = 0; | ||
3154 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | ||
3155 | prexor = 1; | ||
3156 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
3157 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
3158 | sh->reconstruct_state = reconstruct_state_idle; | ||
3159 | |||
3160 | /* All the 'written' buffers and the parity block are ready to | ||
3161 | * be written back to disk | ||
3162 | */ | ||
3163 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | ||
3164 | for (i = disks; i--; ) { | ||
3165 | dev = &sh->dev[i]; | ||
3166 | if (test_bit(R5_LOCKED, &dev->flags) && | ||
3167 | (i == sh->pd_idx || dev->written)) { | ||
3168 | pr_debug("Writing block %d\n", i); | ||
3169 | set_bit(R5_Wantwrite, &dev->flags); | ||
3170 | if (prexor) | ||
3171 | continue; | ||
3172 | if (!test_bit(R5_Insync, &dev->flags) || | ||
3173 | (i == sh->pd_idx && s.failed == 0)) | ||
3174 | set_bit(STRIPE_INSYNC, &sh->state); | ||
3175 | } | ||
3176 | } | ||
3177 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
3178 | dec_preread_active = 1; | ||
3179 | } | ||
3180 | |||
3181 | /* Now to consider new write requests and what else, if anything | ||
3182 | * should be read. We do not handle new writes when: | ||
3183 | * 1/ A 'write' operation (copy+xor) is already in flight. | ||
3184 | * 2/ A 'check' operation is in flight, as it may clobber the parity | ||
3185 | * block. | ||
3186 | */ | ||
3187 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | ||
3188 | handle_stripe_dirtying5(conf, sh, &s, disks); | ||
3189 | |||
3190 | /* maybe we need to check and possibly fix the parity for this stripe | ||
3191 | * Any reads will already have been scheduled, so we just see if enough | ||
3192 | * data is available. The parity check is held off while parity | ||
3193 | * dependent operations are in flight. | ||
3194 | */ | ||
3195 | if (sh->check_state || | ||
3196 | (s.syncing && s.locked == 0 && | ||
3197 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | ||
3198 | !test_bit(STRIPE_INSYNC, &sh->state))) | ||
3199 | handle_parity_checks5(conf, sh, &s, disks); | ||
3200 | |||
3201 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | ||
3202 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | ||
3203 | clear_bit(STRIPE_SYNCING, &sh->state); | ||
3204 | } | ||
3205 | |||
3206 | /* If the failed drive is just a ReadError, then we might need to progress | ||
3207 | * the repair/check process | ||
3208 | */ | ||
3209 | if (s.failed == 1 && !conf->mddev->ro && | ||
3210 | test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) | ||
3211 | && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) | ||
3212 | && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) | ||
3213 | ) { | ||
3214 | dev = &sh->dev[s.failed_num]; | ||
3215 | if (!test_bit(R5_ReWrite, &dev->flags)) { | ||
3216 | set_bit(R5_Wantwrite, &dev->flags); | ||
3217 | set_bit(R5_ReWrite, &dev->flags); | ||
3218 | set_bit(R5_LOCKED, &dev->flags); | ||
3219 | s.locked++; | ||
3220 | } else { | ||
3221 | /* let's read it back */ | ||
3222 | set_bit(R5_Wantread, &dev->flags); | ||
3223 | set_bit(R5_LOCKED, &dev->flags); | ||
3224 | s.locked++; | ||
3225 | } | ||
3226 | } | ||
3227 | |||
3228 | /* Finish reconstruct operations initiated by the expansion process */ | ||
3229 | if (sh->reconstruct_state == reconstruct_state_result) { | ||
3230 | struct stripe_head *sh2 | ||
3231 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3232 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
3233 | /* sh cannot be written until sh2 has been read. | ||
3234 | * so arrange for sh to be delayed a little | ||
3235 | */ | ||
3236 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3237 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3238 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3239 | &sh2->state)) | ||
3240 | atomic_inc(&conf->preread_active_stripes); | ||
3241 | release_stripe(sh2); | ||
3242 | goto unlock; | ||
3243 | } | ||
3244 | if (sh2) | ||
3245 | release_stripe(sh2); | ||
3246 | |||
3247 | sh->reconstruct_state = reconstruct_state_idle; | ||
3248 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
3249 | for (i = conf->raid_disks; i--; ) { | ||
3250 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
3251 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3252 | s.locked++; | ||
3253 | } | ||
3254 | } | ||
3255 | |||
3256 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | ||
3257 | !sh->reconstruct_state) { | ||
3258 | /* Need to write out all blocks after computing parity */ | ||
3259 | sh->disks = conf->raid_disks; | ||
3260 | stripe_set_idx(sh->sector, conf, 0, sh); | ||
3261 | schedule_reconstruction(sh, &s, 1, 1); | ||
3262 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | ||
3263 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3264 | atomic_dec(&conf->reshape_stripes); | ||
3265 | wake_up(&conf->wait_for_overlap); | ||
3266 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | ||
3267 | } | ||
3268 | |||
3269 | if (s.expanding && s.locked == 0 && | ||
3270 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | ||
3271 | handle_stripe_expansion(conf, sh, NULL); | ||
3272 | |||
3273 | unlock: | ||
3274 | spin_unlock(&sh->lock); | ||
3275 | |||
3276 | /* wait for this device to become unblocked */ | ||
3277 | if (unlikely(blocked_rdev)) | ||
3278 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | ||
3279 | |||
3280 | if (s.ops_request) | ||
3281 | raid_run_ops(sh, s.ops_request); | ||
3282 | |||
3283 | ops_run_io(sh, &s); | ||
3284 | |||
3285 | if (dec_preread_active) { | ||
3286 | /* We delay this until after ops_run_io so that if make_request | ||
3287 | * is waiting on a flush, it won't continue until the writes | ||
3288 | * have actually been submitted. | ||
3289 | */ | ||
3290 | atomic_dec(&conf->preread_active_stripes); | ||
3291 | if (atomic_read(&conf->preread_active_stripes) < | ||
3292 | IO_THRESHOLD) | ||
3293 | md_wakeup_thread(conf->mddev->thread); | ||
3294 | } | ||
3295 | return_io(return_bi); | ||
3296 | } | 3107 | } |
3297 | 3108 | ||
3298 | static void handle_stripe6(struct stripe_head *sh) | 3109 | static void handle_stripe(struct stripe_head *sh) |
3299 | { | 3110 | { |
3111 | struct stripe_head_state s; | ||
3300 | raid5_conf_t *conf = sh->raid_conf; | 3112 | raid5_conf_t *conf = sh->raid_conf; |
3113 | int i; | ||
3114 | int prexor; | ||
3301 | int disks = sh->disks; | 3115 | int disks = sh->disks; |
3302 | struct bio *return_bi = NULL; | 3116 | struct r5dev *pdev, *qdev; |
3303 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; | 3117 | |
3304 | struct stripe_head_state s; | 3118 | clear_bit(STRIPE_HANDLE, &sh->state); |
3305 | struct r6_state r6s; | 3119 | if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { |
3306 | struct r5dev *dev, *pdev, *qdev; | 3120 | /* already being handled, ensure it gets handled |
3307 | mdk_rdev_t *blocked_rdev = NULL; | 3121 | * again when current action finishes */ |
3308 | int dec_preread_active = 0; | 3122 | set_bit(STRIPE_HANDLE, &sh->state); |
3123 | return; | ||
3124 | } | ||
3125 | |||
3126 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | ||
3127 | set_bit(STRIPE_SYNCING, &sh->state); | ||
3128 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
3129 | } | ||
3130 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
3309 | 3131 | ||
3310 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3132 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3311 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3133 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
3312 | (unsigned long long)sh->sector, sh->state, | 3134 | (unsigned long long)sh->sector, sh->state, |
3313 | atomic_read(&sh->count), pd_idx, qd_idx, | 3135 | atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, |
3314 | sh->check_state, sh->reconstruct_state); | 3136 | sh->check_state, sh->reconstruct_state); |
3315 | memset(&s, 0, sizeof(s)); | ||
3316 | |||
3317 | spin_lock(&sh->lock); | ||
3318 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
3319 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
3320 | |||
3321 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | ||
3322 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
3323 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
3324 | /* Now to look around and see what can be done */ | ||
3325 | 3137 | ||
3326 | rcu_read_lock(); | 3138 | analyse_stripe(sh, &s); |
3327 | for (i=disks; i--; ) { | ||
3328 | mdk_rdev_t *rdev; | ||
3329 | dev = &sh->dev[i]; | ||
3330 | 3139 | ||
3331 | pr_debug("check %d: state 0x%lx read %p write %p written %p\n", | 3140 | if (s.handle_bad_blocks) { |
3332 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 3141 | set_bit(STRIPE_HANDLE, &sh->state); |
3333 | /* maybe we can reply to a read | 3142 | goto finish; |
3334 | * | ||
3335 | * new wantfill requests are only permitted while | ||
3336 | * ops_complete_biofill is guaranteed to be inactive | ||
3337 | */ | ||
3338 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | ||
3339 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) | ||
3340 | set_bit(R5_Wantfill, &dev->flags); | ||
3341 | |||
3342 | /* now count some things */ | ||
3343 | if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; | ||
3344 | if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; | ||
3345 | if (test_bit(R5_Wantcompute, &dev->flags)) { | ||
3346 | s.compute++; | ||
3347 | BUG_ON(s.compute > 2); | ||
3348 | } | ||
3349 | |||
3350 | if (test_bit(R5_Wantfill, &dev->flags)) { | ||
3351 | s.to_fill++; | ||
3352 | } else if (dev->toread) | ||
3353 | s.to_read++; | ||
3354 | if (dev->towrite) { | ||
3355 | s.to_write++; | ||
3356 | if (!test_bit(R5_OVERWRITE, &dev->flags)) | ||
3357 | s.non_overwrite++; | ||
3358 | } | ||
3359 | if (dev->written) | ||
3360 | s.written++; | ||
3361 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3362 | if (blocked_rdev == NULL && | ||
3363 | rdev && unlikely(test_bit(Blocked, &rdev->flags))) { | ||
3364 | blocked_rdev = rdev; | ||
3365 | atomic_inc(&rdev->nr_pending); | ||
3366 | } | ||
3367 | clear_bit(R5_Insync, &dev->flags); | ||
3368 | if (!rdev) | ||
3369 | /* Not in-sync */; | ||
3370 | else if (test_bit(In_sync, &rdev->flags)) | ||
3371 | set_bit(R5_Insync, &dev->flags); | ||
3372 | else { | ||
3373 | /* in sync if before recovery_offset */ | ||
3374 | if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) | ||
3375 | set_bit(R5_Insync, &dev->flags); | ||
3376 | } | ||
3377 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3378 | /* The ReadError flag will just be confusing now */ | ||
3379 | clear_bit(R5_ReadError, &dev->flags); | ||
3380 | clear_bit(R5_ReWrite, &dev->flags); | ||
3381 | } | ||
3382 | if (test_bit(R5_ReadError, &dev->flags)) | ||
3383 | clear_bit(R5_Insync, &dev->flags); | ||
3384 | if (!test_bit(R5_Insync, &dev->flags)) { | ||
3385 | if (s.failed < 2) | ||
3386 | r6s.failed_num[s.failed] = i; | ||
3387 | s.failed++; | ||
3388 | } | ||
3389 | } | 3143 | } |
3390 | rcu_read_unlock(); | ||
3391 | 3144 | ||
3392 | if (unlikely(blocked_rdev)) { | 3145 | if (unlikely(s.blocked_rdev)) { |
3393 | if (s.syncing || s.expanding || s.expanded || | 3146 | if (s.syncing || s.expanding || s.expanded || |
3394 | s.to_write || s.written) { | 3147 | s.to_write || s.written) { |
3395 | set_bit(STRIPE_HANDLE, &sh->state); | 3148 | set_bit(STRIPE_HANDLE, &sh->state); |
3396 | goto unlock; | 3149 | goto finish; |
3397 | } | 3150 | } |
3398 | /* There is nothing for the blocked_rdev to block */ | 3151 | /* There is nothing for the blocked_rdev to block */ |
3399 | rdev_dec_pending(blocked_rdev, conf->mddev); | 3152 | rdev_dec_pending(s.blocked_rdev, conf->mddev); |
3400 | blocked_rdev = NULL; | 3153 | s.blocked_rdev = NULL; |
3401 | } | 3154 | } |
3402 | 3155 | ||
3403 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { | 3156 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
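Note: the unified handle_stripe() above no longer takes sh->lock; it claims the stripe with test_and_set_bit_lock(STRIPE_ACTIVE, ...), re-queues itself via STRIPE_HANDLE if someone else already owns it, and drops ownership with clear_bit_unlock() at the end. A userspace approximation of that try-lock-or-requeue pattern, using C11 atomics to stand in for the kernel's acquire/release bitops:

/* Try-lock a "stripe" with an atomic flag bit; if another thread owns it,
 * just mark it as needing another pass.  Model of the STRIPE_ACTIVE idea,
 * not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdio.h>

#define STRIPE_ACTIVE (1u << 0)
#define STRIPE_HANDLE (1u << 1)

struct stripe { _Atomic unsigned int state; };

static void handle_stripe(struct stripe *sh)
{
	unsigned int old = atomic_fetch_or_explicit(&sh->state, STRIPE_ACTIVE,
						    memory_order_acquire);
	if (old & STRIPE_ACTIVE) {
		/* already being handled: make sure it gets another pass */
		atomic_fetch_or(&sh->state, STRIPE_HANDLE);
		return;
	}

	/* ... analyse state, record ops, schedule I/O ... */
	puts("handling stripe");

	atomic_fetch_and_explicit(&sh->state, ~STRIPE_ACTIVE,
				  memory_order_release);
}

int main(void)
{
	struct stripe sh = { 0 };

	handle_stripe(&sh);
	return 0;
}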
@@ -3408,83 +3161,92 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3408 | pr_debug("locked=%d uptodate=%d to_read=%d" | 3161 | pr_debug("locked=%d uptodate=%d to_read=%d" |
3409 | " to_write=%d failed=%d failed_num=%d,%d\n", | 3162 | " to_write=%d failed=%d failed_num=%d,%d\n", |
3410 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, | 3163 | s.locked, s.uptodate, s.to_read, s.to_write, s.failed, |
3411 | r6s.failed_num[0], r6s.failed_num[1]); | 3164 | s.failed_num[0], s.failed_num[1]); |
3412 | /* check if the array has lost >2 devices and, if so, some requests | 3165 | /* check if the array has lost more than max_degraded devices and, |
3413 | * might need to be failed | 3166 | * if so, some requests might need to be failed. |
3414 | */ | 3167 | */ |
3415 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 3168 | if (s.failed > conf->max_degraded) { |
3416 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); | 3169 | sh->check_state = 0; |
3417 | if (s.failed > 2 && s.syncing) { | 3170 | sh->reconstruct_state = 0; |
3418 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 3171 | if (s.to_read+s.to_write+s.written) |
3419 | clear_bit(STRIPE_SYNCING, &sh->state); | 3172 | handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); |
3420 | s.syncing = 0; | 3173 | if (s.syncing) |
3174 | handle_failed_sync(conf, sh, &s); | ||
3421 | } | 3175 | } |
3422 | 3176 | ||
3423 | /* | 3177 | /* |
3424 | * might be able to return some write requests if the parity blocks | 3178 | * might be able to return some write requests if the parity blocks |
3425 | * are safe, or on a failed drive | 3179 | * are safe, or on a failed drive |
3426 | */ | 3180 | */ |
3427 | pdev = &sh->dev[pd_idx]; | 3181 | pdev = &sh->dev[sh->pd_idx]; |
3428 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) | 3182 | s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) |
3429 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); | 3183 | || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); |
3430 | qdev = &sh->dev[qd_idx]; | 3184 | qdev = &sh->dev[sh->qd_idx]; |
3431 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) | 3185 | s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) |
3432 | || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); | 3186 | || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) |
3433 | 3187 | || conf->level < 6; | |
3434 | if ( s.written && | 3188 | |
3435 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3189 | if (s.written && |
3190 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | ||
3436 | && !test_bit(R5_LOCKED, &pdev->flags) | 3191 | && !test_bit(R5_LOCKED, &pdev->flags) |
3437 | && test_bit(R5_UPTODATE, &pdev->flags)))) && | 3192 | && test_bit(R5_UPTODATE, &pdev->flags)))) && |
3438 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 3193 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
3439 | && !test_bit(R5_LOCKED, &qdev->flags) | 3194 | && !test_bit(R5_LOCKED, &qdev->flags) |
3440 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 3195 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
3441 | handle_stripe_clean_event(conf, sh, disks, &return_bi); | 3196 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
3442 | 3197 | ||
3443 | /* Now we might consider reading some blocks, either to check/generate | 3198 | /* Now we might consider reading some blocks, either to check/generate |
3444 | * parity, or to satisfy requests | 3199 | * parity, or to satisfy requests |
3445 | * or to load a block that is being partially written. | 3200 | * or to load a block that is being partially written. |
3446 | */ | 3201 | */ |
3447 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 3202 | if (s.to_read || s.non_overwrite |
3448 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) | 3203 | || (conf->level == 6 && s.to_write && s.failed) |
3449 | handle_stripe_fill6(sh, &s, &r6s, disks); | 3204 | || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
3205 | handle_stripe_fill(sh, &s, disks); | ||
3450 | 3206 | ||
3451 | /* Now we check to see if any write operations have recently | 3207 | /* Now we check to see if any write operations have recently |
3452 | * completed | 3208 | * completed |
3453 | */ | 3209 | */ |
3454 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3210 | prexor = 0; |
3455 | 3211 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) | |
3212 | prexor = 1; | ||
3213 | if (sh->reconstruct_state == reconstruct_state_drain_result || | ||
3214 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { | ||
3456 | sh->reconstruct_state = reconstruct_state_idle; | 3215 | sh->reconstruct_state = reconstruct_state_idle; |
3457 | /* All the 'written' buffers and the parity blocks are ready to | 3216 | |
3217 | /* All the 'written' buffers and the parity block are ready to | ||
3458 | * be written back to disk | 3218 | * be written back to disk |
3459 | */ | 3219 | */ |
3460 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | 3220 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); |
3461 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); | 3221 | BUG_ON(sh->qd_idx >= 0 && |
3222 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); | ||
3462 | for (i = disks; i--; ) { | 3223 | for (i = disks; i--; ) { |
3463 | dev = &sh->dev[i]; | 3224 | struct r5dev *dev = &sh->dev[i]; |
3464 | if (test_bit(R5_LOCKED, &dev->flags) && | 3225 | if (test_bit(R5_LOCKED, &dev->flags) && |
3465 | (i == sh->pd_idx || i == qd_idx || | 3226 | (i == sh->pd_idx || i == sh->qd_idx || |
3466 | dev->written)) { | 3227 | dev->written)) { |
3467 | pr_debug("Writing block %d\n", i); | 3228 | pr_debug("Writing block %d\n", i); |
3468 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | ||
3469 | set_bit(R5_Wantwrite, &dev->flags); | 3229 | set_bit(R5_Wantwrite, &dev->flags); |
3230 | if (prexor) | ||
3231 | continue; | ||
3470 | if (!test_bit(R5_Insync, &dev->flags) || | 3232 | if (!test_bit(R5_Insync, &dev->flags) || |
3471 | ((i == sh->pd_idx || i == qd_idx) && | 3233 | ((i == sh->pd_idx || i == sh->qd_idx) && |
3472 | s.failed == 0)) | 3234 | s.failed == 0)) |
3473 | set_bit(STRIPE_INSYNC, &sh->state); | 3235 | set_bit(STRIPE_INSYNC, &sh->state); |
3474 | } | 3236 | } |
3475 | } | 3237 | } |
3476 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 3238 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3477 | dec_preread_active = 1; | 3239 | s.dec_preread_active = 1; |
3478 | } | 3240 | } |
3479 | 3241 | ||
3480 | /* Now to consider new write requests and what else, if anything | 3242 | /* Now to consider new write requests and what else, if anything |
3481 | * should be read. We do not handle new writes when: | 3243 | * should be read. We do not handle new writes when: |
3482 | * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. | 3244 | * 1/ A 'write' operation (copy+xor) is already in flight. |
3483 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 3245 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
3484 | * block. | 3246 | * block. |
3485 | */ | 3247 | */ |
3486 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 3248 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
3487 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); | 3249 | handle_stripe_dirtying(conf, sh, &s, disks); |
3488 | 3250 | ||
3489 | /* maybe we need to check and possibly fix the parity for this stripe | 3251 | /* maybe we need to check and possibly fix the parity for this stripe |
3490 | * Any reads will already have been scheduled, so we just see if enough | 3252 | * Any reads will already have been scheduled, so we just see if enough |
@@ -3494,20 +3256,24 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3494 | if (sh->check_state || | 3256 | if (sh->check_state || |
3495 | (s.syncing && s.locked == 0 && | 3257 | (s.syncing && s.locked == 0 && |
3496 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && | 3258 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
3497 | !test_bit(STRIPE_INSYNC, &sh->state))) | 3259 | !test_bit(STRIPE_INSYNC, &sh->state))) { |
3498 | handle_parity_checks6(conf, sh, &s, &r6s, disks); | 3260 | if (conf->level == 6) |
3261 | handle_parity_checks6(conf, sh, &s, disks); | ||
3262 | else | ||
3263 | handle_parity_checks5(conf, sh, &s, disks); | ||
3264 | } | ||
3499 | 3265 | ||
3500 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 3266 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
3501 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 3267 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); |
3502 | clear_bit(STRIPE_SYNCING, &sh->state); | 3268 | clear_bit(STRIPE_SYNCING, &sh->state); |
3503 | } | 3269 | } |
3504 | 3270 | ||
3505 | /* If the failed drives are just a ReadError, then we might need | 3271 | /* If the failed drives are just a ReadError, then we might need |
3506 | * to progress the repair/check process | 3272 | * to progress the repair/check process |
3507 | */ | 3273 | */ |
3508 | if (s.failed <= 2 && !conf->mddev->ro) | 3274 | if (s.failed <= conf->max_degraded && !conf->mddev->ro) |
3509 | for (i = 0; i < s.failed; i++) { | 3275 | for (i = 0; i < s.failed; i++) { |
3510 | dev = &sh->dev[r6s.failed_num[i]]; | 3276 | struct r5dev *dev = &sh->dev[s.failed_num[i]]; |
3511 | if (test_bit(R5_ReadError, &dev->flags) | 3277 | if (test_bit(R5_ReadError, &dev->flags) |
3512 | && !test_bit(R5_LOCKED, &dev->flags) | 3278 | && !test_bit(R5_LOCKED, &dev->flags) |
3513 | && test_bit(R5_UPTODATE, &dev->flags) | 3279 | && test_bit(R5_UPTODATE, &dev->flags) |
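Note: the repair loop above now walks s.failed_num[] for up to max_degraded devices instead of a single failed_num. The per-device action is the same two-step dance visible in the removed handle_stripe5() code earlier in this diff: a block whose only problem is R5_ReadError, with the data already recomputed, is first rewritten (R5_ReWrite) and on a later pass read back to verify the medium. A tiny model of that sequence (flag names mirror the kernel's, the rest is invented):

/* Two-pass repair of a read error: rewrite the block, then read it back.
 * Standalone model only.
 */
#include <stdbool.h>
#include <stdio.h>

struct dev { bool read_error, uptodate, locked, rewrite_issued; };

static const char *repair_step(struct dev *d)
{
	if (!d->read_error || d->locked || !d->uptodate)
		return "nothing to do";
	d->locked = true;
	if (!d->rewrite_issued) {
		d->rewrite_issued = true;	/* pass 1: R5_Wantwrite */
		return "rewrite";
	}
	return "read back";			/* pass 2: R5_Wantread */
}

int main(void)
{
	struct dev d = { .read_error = true, .uptodate = true };

	printf("%s\n", repair_step(&d));	/* rewrite */
	d.locked = false;			/* the write completed */
	printf("%s\n", repair_step(&d));	/* read back */
	return 0;
}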
@@ -3526,8 +3292,26 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3526 | } | 3292 | } |
3527 | } | 3293 | } |
3528 | 3294 | ||
3295 | |||
3529 | /* Finish reconstruct operations initiated by the expansion process */ | 3296 | /* Finish reconstruct operations initiated by the expansion process */ |
3530 | if (sh->reconstruct_state == reconstruct_state_result) { | 3297 | if (sh->reconstruct_state == reconstruct_state_result) { |
3298 | struct stripe_head *sh_src | ||
3299 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3300 | if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { | ||
3301 | /* sh cannot be written until sh_src has been read. | ||
3302 | * so arrange for sh to be delayed a little | ||
3303 | */ | ||
3304 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3305 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3306 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3307 | &sh_src->state)) | ||
3308 | atomic_inc(&conf->preread_active_stripes); | ||
3309 | release_stripe(sh_src); | ||
3310 | goto finish; | ||
3311 | } | ||
3312 | if (sh_src) | ||
3313 | release_stripe(sh_src); | ||
3314 | |||
3531 | sh->reconstruct_state = reconstruct_state_idle; | 3315 | sh->reconstruct_state = reconstruct_state_idle; |
3532 | clear_bit(STRIPE_EXPANDING, &sh->state); | 3316 | clear_bit(STRIPE_EXPANDING, &sh->state); |
3533 | for (i = conf->raid_disks; i--; ) { | 3317 | for (i = conf->raid_disks; i--; ) { |
@@ -3539,24 +3323,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3539 | 3323 | ||
3540 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 3324 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
3541 | !sh->reconstruct_state) { | 3325 | !sh->reconstruct_state) { |
3542 | struct stripe_head *sh2 | 3326 | /* Need to write out all blocks after computing parity */ |
3543 | = get_active_stripe(conf, sh->sector, 1, 1, 1); | ||
3544 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
3545 | /* sh cannot be written until sh2 has been read. | ||
3546 | * so arrange for sh to be delayed a little | ||
3547 | */ | ||
3548 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3549 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3550 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3551 | &sh2->state)) | ||
3552 | atomic_inc(&conf->preread_active_stripes); | ||
3553 | release_stripe(sh2); | ||
3554 | goto unlock; | ||
3555 | } | ||
3556 | if (sh2) | ||
3557 | release_stripe(sh2); | ||
3558 | |||
3559 | /* Need to write out all blocks after computing P&Q */ | ||
3560 | sh->disks = conf->raid_disks; | 3327 | sh->disks = conf->raid_disks; |
3561 | stripe_set_idx(sh->sector, conf, 0, sh); | 3328 | stripe_set_idx(sh->sector, conf, 0, sh); |
3562 | schedule_reconstruction(sh, &s, 1, 1); | 3329 | schedule_reconstruction(sh, &s, 1, 1); |
@@ -3569,22 +3336,39 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3569 | 3336 | ||
3570 | if (s.expanding && s.locked == 0 && | 3337 | if (s.expanding && s.locked == 0 && |
3571 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) | 3338 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
3572 | handle_stripe_expansion(conf, sh, &r6s); | 3339 | handle_stripe_expansion(conf, sh); |
3573 | |||
3574 | unlock: | ||
3575 | spin_unlock(&sh->lock); | ||
3576 | 3340 | ||
3341 | finish: | ||
3577 | /* wait for this device to become unblocked */ | 3342 | /* wait for this device to become unblocked */ |
3578 | if (unlikely(blocked_rdev)) | 3343 | if (conf->mddev->external && unlikely(s.blocked_rdev)) |
3579 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 3344 | md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev); |
3345 | |||
3346 | if (s.handle_bad_blocks) | ||
3347 | for (i = disks; i--; ) { | ||
3348 | mdk_rdev_t *rdev; | ||
3349 | struct r5dev *dev = &sh->dev[i]; | ||
3350 | if (test_and_clear_bit(R5_WriteError, &dev->flags)) { | ||
3351 | /* We own a safe reference to the rdev */ | ||
3352 | rdev = conf->disks[i].rdev; | ||
3353 | if (!rdev_set_badblocks(rdev, sh->sector, | ||
3354 | STRIPE_SECTORS, 0)) | ||
3355 | md_error(conf->mddev, rdev); | ||
3356 | rdev_dec_pending(rdev, conf->mddev); | ||
3357 | } | ||
3358 | if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { | ||
3359 | rdev = conf->disks[i].rdev; | ||
3360 | rdev_clear_badblocks(rdev, sh->sector, | ||
3361 | STRIPE_SECTORS); | ||
3362 | rdev_dec_pending(rdev, conf->mddev); | ||
3363 | } | ||
3364 | } | ||
3580 | 3365 | ||
3581 | if (s.ops_request) | 3366 | if (s.ops_request) |
3582 | raid_run_ops(sh, s.ops_request); | 3367 | raid_run_ops(sh, s.ops_request); |
3583 | 3368 | ||
3584 | ops_run_io(sh, &s); | 3369 | ops_run_io(sh, &s); |
3585 | 3370 | ||
3586 | 3371 | if (s.dec_preread_active) { | |
3587 | if (dec_preread_active) { | ||
3588 | /* We delay this until after ops_run_io so that if make_request | 3372 | /* We delay this until after ops_run_io so that if make_request |
3589 | * is waiting on a flush, it won't continue until the writes | 3373 | * is waiting on a flush, it won't continue until the writes |
3590 | * have actually been submitted. | 3374 | * have actually been submitted. |
@@ -3595,15 +3379,9 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3595 | md_wakeup_thread(conf->mddev->thread); | 3379 | md_wakeup_thread(conf->mddev->thread); |
3596 | } | 3380 | } |
3597 | 3381 | ||
3598 | return_io(return_bi); | 3382 | return_io(s.return_bi); |
3599 | } | ||
3600 | 3383 | ||
3601 | static void handle_stripe(struct stripe_head *sh) | 3384 | clear_bit_unlock(STRIPE_ACTIVE, &sh->state); |
3602 | { | ||
3603 | if (sh->raid_conf->level == 6) | ||
3604 | handle_stripe6(sh); | ||
3605 | else | ||
3606 | handle_stripe5(sh); | ||
3607 | } | 3385 | } |
3608 | 3386 | ||
3609 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3387 | static void raid5_activate_delayed(raid5_conf_t *conf) |
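Note: the new 'finish:' section shown above does the bad-block bookkeeping once the stripe work is scheduled: a device that saw a write error (R5_WriteError) gets the stripe's sectors recorded as bad via rdev_set_badblocks(), falling back to md_error() if recording fails, while a successful rewrite over a previously bad region (R5_MadeGood) clears the record. A sketch of that decision, with stub helpers standing in for the rdev calls:

/* Post-handling bad-block bookkeeping, modelled on the 'finish:' loop.
 * The helpers below are stand-ins, not the kernel rdev API.
 */
#include <stdbool.h>
#include <stdio.h>

static bool record_badblock(int disk, unsigned long long sector, int sectors)
{
	printf("disk %d: sectors %llu+%d marked bad\n", disk, sector, sectors);
	return true;		/* pretend the bad-block table had room */
}

static void fail_device(int disk)
{
	printf("disk %d: cannot record bad block, failing device\n", disk);
}

static void clear_badblock(int disk, unsigned long long sector, int sectors)
{
	printf("disk %d: sectors %llu+%d good again\n", disk, sector, sectors);
}

static void finish_badblocks(int disk, bool write_error, bool made_good,
			     unsigned long long sector, int sectors)
{
	if (write_error && !record_badblock(disk, sector, sectors))
		fail_device(disk);	/* same escalation as md_error() */
	if (made_good)
		clear_badblock(disk, sector, sectors);
}

int main(void)
{
	finish_badblocks(2, true, false, 1024, 8);
	return 0;
}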
@@ -3833,6 +3611,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3833 | rcu_read_lock(); | 3611 | rcu_read_lock(); |
3834 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3612 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
3835 | if (rdev && test_bit(In_sync, &rdev->flags)) { | 3613 | if (rdev && test_bit(In_sync, &rdev->flags)) { |
3614 | sector_t first_bad; | ||
3615 | int bad_sectors; | ||
3616 | |||
3836 | atomic_inc(&rdev->nr_pending); | 3617 | atomic_inc(&rdev->nr_pending); |
3837 | rcu_read_unlock(); | 3618 | rcu_read_unlock(); |
3838 | raid_bio->bi_next = (void*)rdev; | 3619 | raid_bio->bi_next = (void*)rdev; |
@@ -3840,8 +3621,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3840 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); | 3621 | align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); |
3841 | align_bi->bi_sector += rdev->data_offset; | 3622 | align_bi->bi_sector += rdev->data_offset; |
3842 | 3623 | ||
3843 | if (!bio_fits_rdev(align_bi)) { | 3624 | if (!bio_fits_rdev(align_bi) || |
3844 | /* too big in some way */ | 3625 | is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, |
3626 | &first_bad, &bad_sectors)) { | ||
3627 | /* too big in some way, or has a known bad block */ | ||
3845 | bio_put(align_bi); | 3628 | bio_put(align_bi); |
3846 | rdev_dec_pending(rdev, mddev); | 3629 | rdev_dec_pending(rdev, mddev); |
3847 | return 0; | 3630 | return 0; |
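Note: chunk_aligned_read() now refuses to bypass the stripe cache when the target range overlaps a known bad block, so such reads fall back to the normal stripe path where the data can be reconstructed. A minimal overlap check in the spirit of is_badblock() (the kernel version also reports the first bad sector and keeps a packed, sorted table):

/* Does [sector, sector+len) overlap any recorded bad range?
 * Simplified stand-in for is_badblock(): linear scan over an unsorted list.
 */
#include <stdbool.h>
#include <stdio.h>

struct bad_range { unsigned long long start; int len; };

static bool overlaps_badblock(const struct bad_range *bb, int nr,
			      unsigned long long sector, int len)
{
	for (int i = 0; i < nr; i++)
		if (sector < bb[i].start + bb[i].len &&
		    bb[i].start < sector + len)
			return true;
	return false;
}

int main(void)
{
	struct bad_range bb[] = { { 1000, 8 } };

	/* an aligned read at 1004 overlaps: take the stripe-cache path */
	printf("%d\n", overlaps_badblock(bb, 1, 1004, 16));
	return 0;
}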
@@ -4016,7 +3799,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4016 | } | 3799 | } |
4017 | } | 3800 | } |
4018 | 3801 | ||
4019 | if (bio_data_dir(bi) == WRITE && | 3802 | if (rw == WRITE && |
4020 | logical_sector >= mddev->suspend_lo && | 3803 | logical_sector >= mddev->suspend_lo && |
4021 | logical_sector < mddev->suspend_hi) { | 3804 | logical_sector < mddev->suspend_hi) { |
4022 | release_stripe(sh); | 3805 | release_stripe(sh); |
@@ -4034,7 +3817,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4034 | } | 3817 | } |
4035 | 3818 | ||
4036 | if (test_bit(STRIPE_EXPANDING, &sh->state) || | 3819 | if (test_bit(STRIPE_EXPANDING, &sh->state) || |
4037 | !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { | 3820 | !add_stripe_bio(sh, bi, dd_idx, rw)) { |
4038 | /* Stripe is busy expanding or | 3821 | /* Stripe is busy expanding or |
4039 | * add failed due to overlap. Flush everything | 3822 | * add failed due to overlap. Flush everything |
4040 | * and wait a while | 3823 | * and wait a while |
@@ -4375,10 +4158,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4375 | 4158 | ||
4376 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); | 4159 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); |
4377 | 4160 | ||
4378 | spin_lock(&sh->lock); | 4161 | set_bit(STRIPE_SYNC_REQUESTED, &sh->state); |
4379 | set_bit(STRIPE_SYNCING, &sh->state); | ||
4380 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
4381 | spin_unlock(&sh->lock); | ||
4382 | 4162 | ||
4383 | handle_stripe(sh); | 4163 | handle_stripe(sh); |
4384 | release_stripe(sh); | 4164 | release_stripe(sh); |
@@ -4509,6 +4289,9 @@ static void raid5d(mddev_t *mddev) | |||
4509 | release_stripe(sh); | 4289 | release_stripe(sh); |
4510 | cond_resched(); | 4290 | cond_resched(); |
4511 | 4291 | ||
4292 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | ||
4293 | md_check_recovery(mddev); | ||
4294 | |||
4512 | spin_lock_irq(&conf->device_lock); | 4295 | spin_lock_irq(&conf->device_lock); |
4513 | } | 4296 | } |
4514 | pr_debug("%d stripes handled\n", handled); | 4297 | pr_debug("%d stripes handled\n", handled); |
@@ -5162,8 +4945,7 @@ static int run(mddev_t *mddev) | |||
5162 | 4945 | ||
5163 | return 0; | 4946 | return 0; |
5164 | abort: | 4947 | abort: |
5165 | md_unregister_thread(mddev->thread); | 4948 | md_unregister_thread(&mddev->thread); |
5166 | mddev->thread = NULL; | ||
5167 | if (conf) { | 4949 | if (conf) { |
5168 | print_raid5_conf(conf); | 4950 | print_raid5_conf(conf); |
5169 | free_conf(conf); | 4951 | free_conf(conf); |
@@ -5177,8 +4959,7 @@ static int stop(mddev_t *mddev) | |||
5177 | { | 4959 | { |
5178 | raid5_conf_t *conf = mddev->private; | 4960 | raid5_conf_t *conf = mddev->private; |
5179 | 4961 | ||
5180 | md_unregister_thread(mddev->thread); | 4962 | md_unregister_thread(&mddev->thread); |
5181 | mddev->thread = NULL; | ||
5182 | if (mddev->queue) | 4963 | if (mddev->queue) |
5183 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4964 | mddev->queue->backing_dev_info.congested_fn = NULL; |
5184 | free_conf(conf); | 4965 | free_conf(conf); |
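Note: both the run() error path and stop() now pass &mddev->thread, so md_unregister_thread() can clear the owner's pointer itself instead of every caller remembering to write 'mddev->thread = NULL' afterwards. A generic sketch of that idiom; stop_worker() and struct worker are made-up stand-ins, not the md API:

/* Take the address of the owner's pointer so teardown can both stop the
 * worker and clear the reference in one place.  Illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>

struct worker { const char *name; };

static void stop_worker(struct worker **wp)
{
	struct worker *w = *wp;

	*wp = NULL;		/* owner can no longer reach a dying worker */
	if (!w)
		return;
	printf("stopping %s\n", w->name);
	free(w);
}

int main(void)
{
	struct worker *thread = malloc(sizeof(*thread));

	thread->name = "raid5d";
	stop_worker(&thread);	/* thread is now NULL */
	stop_worker(&thread);	/* and a second call is harmless */
	return 0;
}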
@@ -5313,6 +5094,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
5313 | * isn't possible. | 5094 | * isn't possible. |
5314 | */ | 5095 | */ |
5315 | if (!test_bit(Faulty, &rdev->flags) && | 5096 | if (!test_bit(Faulty, &rdev->flags) && |
5097 | mddev->recovery_disabled != conf->recovery_disabled && | ||
5316 | !has_failed(conf) && | 5098 | !has_failed(conf) && |
5317 | number < conf->raid_disks) { | 5099 | number < conf->raid_disks) { |
5318 | err = -EBUSY; | 5100 | err = -EBUSY; |
@@ -5341,6 +5123,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5341 | int first = 0; | 5123 | int first = 0; |
5342 | int last = conf->raid_disks - 1; | 5124 | int last = conf->raid_disks - 1; |
5343 | 5125 | ||
5126 | if (mddev->recovery_disabled == conf->recovery_disabled) | ||
5127 | return -EBUSY; | ||
5128 | |||
5344 | if (has_failed(conf)) | 5129 | if (has_failed(conf)) |
5345 | /* no point adding a device */ | 5130 | /* no point adding a device */ |
5346 | return -EINVAL; | 5131 | return -EINVAL; |
@@ -5519,16 +5304,14 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5519 | if (rdev->raid_disk < 0 && | 5304 | if (rdev->raid_disk < 0 && |
5520 | !test_bit(Faulty, &rdev->flags)) { | 5305 | !test_bit(Faulty, &rdev->flags)) { |
5521 | if (raid5_add_disk(mddev, rdev) == 0) { | 5306 | if (raid5_add_disk(mddev, rdev) == 0) { |
5522 | char nm[20]; | ||
5523 | if (rdev->raid_disk | 5307 | if (rdev->raid_disk |
5524 | >= conf->previous_raid_disks) { | 5308 | >= conf->previous_raid_disks) { |
5525 | set_bit(In_sync, &rdev->flags); | 5309 | set_bit(In_sync, &rdev->flags); |
5526 | added_devices++; | 5310 | added_devices++; |
5527 | } else | 5311 | } else |
5528 | rdev->recovery_offset = 0; | 5312 | rdev->recovery_offset = 0; |
5529 | sprintf(nm, "rd%d", rdev->raid_disk); | 5313 | |
5530 | if (sysfs_create_link(&mddev->kobj, | 5314 | if (sysfs_link_rdev(mddev, rdev)) |
5531 | &rdev->kobj, nm)) | ||
5532 | /* Failure here is OK */; | 5315 | /* Failure here is OK */; |
5533 | } | 5316 | } |
5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | 5317 | } else if (rdev->raid_disk >= conf->previous_raid_disks |
@@ -5624,9 +5407,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
5624 | d++) { | 5407 | d++) { |
5625 | mdk_rdev_t *rdev = conf->disks[d].rdev; | 5408 | mdk_rdev_t *rdev = conf->disks[d].rdev; |
5626 | if (rdev && raid5_remove_disk(mddev, d) == 0) { | 5409 | if (rdev && raid5_remove_disk(mddev, d) == 0) { |
5627 | char nm[20]; | 5410 | sysfs_unlink_rdev(mddev, rdev); |
5628 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
5629 | sysfs_remove_link(&mddev->kobj, nm); | ||
5630 | rdev->raid_disk = -1; | 5411 | rdev->raid_disk = -1; |
5631 | } | 5412 | } |
5632 | } | 5413 | } |
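Note: raid5_start_reshape() and raid5_finish_reshape() now call sysfs_link_rdev()/sysfs_unlink_rdev() instead of open-coding the "rd%d" name plus sysfs_create_link()/sysfs_remove_link(). Judging only from the code being removed, the helpers presumably wrap exactly that pattern; the sketch below is a guess at their shape, written as a standalone model with stub link functions:

/* Likely shape of the rd%d link helpers, inferred from the removed code.
 * create_link()/remove_link() are stand-ins for the sysfs calls.
 */
#include <stdio.h>

struct rdev { int raid_disk; };

static int create_link(const char *name) { printf("link %s\n", name); return 0; }
static void remove_link(const char *name) { printf("unlink %s\n", name); }

static int link_rdev(const struct rdev *rdev)
{
	char nm[20];

	snprintf(nm, sizeof(nm), "rd%d", rdev->raid_disk);
	return create_link(nm);
}

static void unlink_rdev(const struct rdev *rdev)
{
	char nm[20];

	snprintf(nm, sizeof(nm), "rd%d", rdev->raid_disk);
	remove_link(nm);
}

int main(void)
{
	struct rdev r = { .raid_disk = 3 };

	link_rdev(&r);		/* "rd3" appears under the array's kobject */
	unlink_rdev(&r);
	return 0;
}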
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 3ca77a2613b..11b9566184b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -6,11 +6,11 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * | 8 | * |
9 | * Each stripe contains one buffer per disc. Each buffer can be in | 9 | * Each stripe contains one buffer per device. Each buffer can be in |
10 | * one of a number of states stored in "flags". Changes between | 10 | * one of a number of states stored in "flags". Changes between |
11 | * these states happen *almost* exclusively under a per-stripe | 11 | * these states happen *almost* exclusively under the protection of the |
12 | * spinlock. Some very specific changes can happen in bi_end_io, and | 12 | * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and |
13 | * these are not protected by the spin lock. | 13 | * these are not protected by STRIPE_ACTIVE. |
14 | * | 14 | * |
15 | * The flag bits that are used to represent these states are: | 15 | * The flag bits that are used to represent these states are: |
16 | * R5_UPTODATE and R5_LOCKED | 16 | * R5_UPTODATE and R5_LOCKED |
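With the per-stripe spinlock gone, the rule documented above is that per-device flag transitions are made by whoever owns STRIPE_ACTIVE, i.e. from inside handle_stripe(). An illustrative helper (want_write() is hypothetical; R5_LOCKED and R5_Wantwrite are the real per-device flags):

/* Illustrative only: mark a block for write-out.  No spinlock is
 * needed because the caller is the single owner of STRIPE_ACTIVE for
 * this stripe.
 */
static void want_write(struct stripe_head *sh, int i)
{
	struct r5dev *dev = &sh->dev[i];

	set_bit(R5_LOCKED, &dev->flags);
	set_bit(R5_Wantwrite, &dev->flags);
}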
@@ -76,12 +76,10 @@ | |||
76 | * block and the cached buffer are successfully written, any buffer on | 76 | * block and the cached buffer are successfully written, any buffer on |
77 | * a written list can be returned with b_end_io. | 77 | * a written list can be returned with b_end_io. |
78 | * | 78 | * |
79 | * The write list and read list both act as fifos. The read list is | 79 | * The write list and read list both act as fifos. The read list, |
80 | * protected by the device_lock. The write and written lists are | 80 | * write list and written list are protected by the device_lock. |
81 | * protected by the stripe lock. The device_lock, which can be | 81 | * The device_lock is only for list manipulations and will only be |
82 | * claimed while the stipe lock is held, is only for list | 82 | * held for a very short time. It can be claimed from interrupts. |
83 | * manipulations and will only be held for a very short time. It can | ||
84 | * be claimed from interrupts. | ||
85 | * | 83 | * |
86 | * | 84 | * |
87 | * Stripes in the stripe cache can be on one of two lists (or on | 85 | * Stripes in the stripe cache can be on one of two lists (or on |
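Because device_lock may be taken from interrupt context and is only meant to cover the list manipulation itself, callers use the irq-disabling variants and keep the critical section tiny. A minimal sketch of queuing a read on a stripe, assuming the bi_next chaining used by this code (the real add_stripe_bio() additionally keeps the chain sorted by sector and does accounting):

static void queue_read(raid5_conf_t *conf, struct stripe_head *sh,
		       int dd_idx, struct bio *bi)
{
	/* device_lock covers the toread/towrite/written chains and is
	 * held only for the pointer update itself.
	 */
	spin_lock_irq(&conf->device_lock);
	bi->bi_next = sh->dev[dd_idx].toread;
	sh->dev[dd_idx].toread = bi;
	spin_unlock_irq(&conf->device_lock);
}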
@@ -96,7 +94,6 @@ | |||
96 | * | 94 | * |
97 | * The inactive_list, handle_list and hash bucket lists are all protected by the | 95 | * The inactive_list, handle_list and hash bucket lists are all protected by the |
98 | * device_lock. | 96 | * device_lock. |
99 | * - stripes on the inactive_list never have their stripe_lock held. | ||
100 | * - stripes have a reference counter. If count==0, they are on a list. | 97 | * - stripes have a reference counter. If count==0, they are on a list. |
101 | * - If a stripe might need handling, STRIPE_HANDLE is set. | 98 | * - If a stripe might need handling, STRIPE_HANDLE is set. |
102 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on | 99 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on |
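The refcount rule above ("count==0 means the stripe is on a list") is implemented by the release path. A simplified sketch, using the list names from the comment (the real __release_stripe() is called with device_lock held and also handles the delayed and bitmap-delay queues):

static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	/* caller holds conf->device_lock */
	if (!atomic_dec_and_test(&sh->count))
		return;			/* still in use, stays off all lists */

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		list_add_tail(&sh->lru, &conf->handle_list);
		md_wakeup_thread(conf->mddev->thread);
	} else {
		list_add_tail(&sh->lru, &conf->inactive_list);
		wake_up(&conf->wait_for_stripe);
	}
}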
@@ -116,10 +113,10 @@ | |||
116 | * attach a request to an active stripe (add_stripe_bh()) | 113 | * attach a request to an active stripe (add_stripe_bh()) |
117 | * lockdev attach-buffer unlockdev | 114 | * lockdev attach-buffer unlockdev |
118 | * handle a stripe (handle_stripe()) | 115 | * handle a stripe (handle_stripe()) |
119 | * lockstripe clrSTRIPE_HANDLE ... | 116 | * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... |
120 | * (lockdev check-buffers unlockdev) .. | 117 | * (lockdev check-buffers unlockdev) .. |
121 | * change-state .. | 118 | * change-state .. |
122 | * record io/ops needed unlockstripe schedule io/ops | 119 | * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops |
123 | * release an active stripe (release_stripe()) | 120 | * release an active stripe (release_stripe()) |
124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
125 | * | 122 | * |
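The setSTRIPE_ACTIVE ... clearSTRIPE_ACTIVE bracket in the flow above replaces the old lockstripe/unlockstripe pair. The entry and exit of handle_stripe() now look roughly like this (a sketch; the analysis and the I/O scheduling in the middle are elided):

static void handle_stripe(struct stripe_head *sh)
{
	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
		/* already being handled; make sure it is looked at again */
		set_bit(STRIPE_HANDLE, &sh->state);
		return;
	}
	clear_bit(STRIPE_HANDLE, &sh->state);

	/* ... analyse buffer states, record io/ops needed ... */

	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);

	/* ... now schedule the recorded io/ops, return completed bios ... */
}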
@@ -128,8 +125,7 @@ | |||
128 | * on a cached buffer, and plus one if the stripe is undergoing stripe | 125 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
129 | * operations. | 126 | * operations. |
130 | * | 127 | * |
131 | * Stripe operations are performed outside the stripe lock, | 128 | * The stripe operations are: |
132 | * the stripe operations are: | ||
133 | * -copying data between the stripe cache and user application buffers | 129 | * -copying data between the stripe cache and user application buffers |
134 | * -computing blocks to save a disk access, or to recover a missing block | 130 | * -computing blocks to save a disk access, or to recover a missing block |
135 | * -updating the parity on a write operation (reconstruct write and | 131 | * -updating the parity on a write operation (reconstruct write and |
@@ -159,7 +155,8 @@ | |||
159 | */ | 155 | */ |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Operations state - intermediate states that are visible outside of sh->lock | 158 | * Operations state - intermediate states that are visible outside of |
159 | * STRIPE_ACTIVE. | ||
163 | * In general _idle indicates nothing is running, _run indicates a data | 160 | * In general _idle indicates nothing is running, _run indicates a data |
164 | * processing operation is active, and _result means the data processing result | 161 | * processing operation is active, and _result means the data processing result |
165 | * is stable and can be acted upon. For simple operations like biofill and | 162 | * is stable and can be acted upon. For simple operations like biofill and |
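The _idle/_run/_result convention described here is what the check_state and reconstruct_state members of struct stripe_head follow. For illustration only, an enum in that style reads (member names below are illustrative, not quoted from the header):

enum example_op_states {
	example_op_idle = 0,	/* nothing running */
	example_op_run,		/* async data processing in flight */
	example_op_result,	/* result is stable and can be acted upon */
};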
@@ -209,7 +206,6 @@ struct stripe_head { | |||
209 | short ddf_layout;/* use DDF ordering to calculate Q */ | 206 | short ddf_layout;/* use DDF ordering to calculate Q */ |
210 | unsigned long state; /* state flags */ | 207 | unsigned long state; /* state flags */ |
211 | atomic_t count; /* nr of active thread/requests */ | 208 | atomic_t count; /* nr of active thread/requests */ |
212 | spinlock_t lock; | ||
213 | int bm_seq; /* sequence number for bitmap flushes */ | 209 | int bm_seq; /* sequence number for bitmap flushes */ |
214 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
215 | enum check_states check_state; | 211 | enum check_states check_state; |
@@ -240,19 +236,20 @@ struct stripe_head { | |||
240 | }; | 236 | }; |
241 | 237 | ||
242 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | 238 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head |
243 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | 239 | * for handle_stripe. |
244 | */ | 240 | */ |
245 | struct stripe_head_state { | 241 | struct stripe_head_state { |
246 | int syncing, expanding, expanded; | 242 | int syncing, expanding, expanded; |
247 | int locked, uptodate, to_read, to_write, failed, written; | 243 | int locked, uptodate, to_read, to_write, failed, written; |
248 | int to_fill, compute, req_compute, non_overwrite; | 244 | int to_fill, compute, req_compute, non_overwrite; |
249 | int failed_num; | 245 | int failed_num[2]; |
246 | int p_failed, q_failed; | ||
247 | int dec_preread_active; | ||
250 | unsigned long ops_request; | 248 | unsigned long ops_request; |
251 | }; | ||
252 | 249 | ||
253 | /* r6_state - extra state data only relevant to r6 */ | 250 | struct bio *return_bi; |
254 | struct r6_state { | 251 | mdk_rdev_t *blocked_rdev; |
255 | int p_failed, q_failed, failed_num[2]; | 252 | int handle_bad_blocks; |
256 | }; | 253 | }; |
257 | 254 | ||
258 | /* Flags */ | 255 | /* Flags */ |
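Folding r6_state into stripe_head_state gives RAID5 and RAID6 a single analysis structure; for RAID5 the q_failed field and the second failed_num[] slot simply stay unused. A hypothetical helper showing how the merged fields might be filled during stripe analysis (record_failed() is illustrative; the fields are the real ones above):

static void record_failed(struct stripe_head_state *s, int disk_idx,
			  int pd_idx, int qd_idx)
{
	if (s->failed < 2)
		s->failed_num[s->failed] = disk_idx;
	s->failed++;
	if (disk_idx == pd_idx)
		s->p_failed = 1;	/* the P parity device is gone */
	if (disk_idx == qd_idx)
		s->q_failed = 1;	/* the Q syndrome device is gone */
}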
@@ -268,14 +265,16 @@ struct r6_state { | |||
268 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | 265 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ |
269 | 266 | ||
270 | #define R5_Expanded 10 /* This block now has post-expand data */ | 267 | #define R5_Expanded 10 /* This block now has post-expand data */ |
271 | #define R5_Wantcompute 11 /* compute_block in progress treat as | 268 | #define R5_Wantcompute 11 /* compute_block in progress treat as |
272 | * uptodate | 269 | * uptodate |
273 | */ | 270 | */ |
274 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 271 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
275 | * filling | 272 | * filling |
276 | */ | 273 | */ |
277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 274 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
278 | #define R5_WantFUA 14 /* Write should be FUA */ | 275 | #define R5_WantFUA 14 /* Write should be FUA */ |
276 | #define R5_WriteError 15 /* got a write error - need to record it */ | ||
277 | #define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/ | ||
279 | /* | 278 | /* |
280 | * Write method | 279 | * Write method |
281 | */ | 280 | */ |
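The new R5_WriteError and R5_MadeGood bits follow the usual "record now, act later" pattern: the bio completion only marks the per-device flag, and handle_stripe() later turns R5_WriteError into a bad-block record (or md_error()) and R5_MadeGood into clearing an existing bad-block entry. A simplified sketch of the completion side, assuming the two-argument bi_end_io signature of this kernel series and eliding the rdev accounting:

static void raid5_end_write_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	int disks = sh->disks, i;

	for (i = 0; i < disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	if (!uptodate)
		set_bit(R5_WriteError, &sh->dev[i].flags);

	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}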
@@ -289,21 +288,25 @@ struct r6_state { | |||
289 | /* | 288 | /* |
290 | * Stripe state | 289 | * Stripe state |
291 | */ | 290 | */ |
292 | #define STRIPE_HANDLE 2 | 291 | enum { |
293 | #define STRIPE_SYNCING 3 | 292 | STRIPE_ACTIVE, |
294 | #define STRIPE_INSYNC 4 | 293 | STRIPE_HANDLE, |
295 | #define STRIPE_PREREAD_ACTIVE 5 | 294 | STRIPE_SYNC_REQUESTED, |
296 | #define STRIPE_DELAYED 6 | 295 | STRIPE_SYNCING, |
297 | #define STRIPE_DEGRADED 7 | 296 | STRIPE_INSYNC, |
298 | #define STRIPE_BIT_DELAY 8 | 297 | STRIPE_PREREAD_ACTIVE, |
299 | #define STRIPE_EXPANDING 9 | 298 | STRIPE_DELAYED, |
300 | #define STRIPE_EXPAND_SOURCE 10 | 299 | STRIPE_DEGRADED, |
301 | #define STRIPE_EXPAND_READY 11 | 300 | STRIPE_BIT_DELAY, |
302 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 301 | STRIPE_EXPANDING, |
303 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 302 | STRIPE_EXPAND_SOURCE, |
304 | #define STRIPE_BIOFILL_RUN 14 | 303 | STRIPE_EXPAND_READY, |
305 | #define STRIPE_COMPUTE_RUN 15 | 304 | STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ |
306 | #define STRIPE_OPS_REQ_PENDING 16 | 305 | STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ |
306 | STRIPE_BIOFILL_RUN, | ||
307 | STRIPE_COMPUTE_RUN, | ||
308 | STRIPE_OPS_REQ_PENDING, | ||
309 | }; | ||
307 | 310 | ||
308 | /* | 311 | /* |
309 | * Operation request flags | 312 | * Operation request flags |
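Converting the STRIPE_* state bits from #defines to an enum changes nothing for the callers: the members are still plain bit numbers used with the atomic bitops on sh->state. The only additions are the two new bits, STRIPE_ACTIVE (the handle_stripe ownership bit) and STRIPE_SYNC_REQUESTED. For example (hypothetical wrapper, real flag and field names):

static inline void mark_stripe_for_handling(struct stripe_head *sh)
{
	/* enum members are bit numbers, so set_bit() works as before */
	set_bit(STRIPE_HANDLE, &sh->state);
}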
@@ -336,7 +339,7 @@ struct r6_state { | |||
336 | * PREREAD_ACTIVE. | 339 | * PREREAD_ACTIVE. |
337 | * In stripe_handle, if we find pre-reading is necessary, we do it if | 340 | * In stripe_handle, if we find pre-reading is necessary, we do it if |
338 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. | 341 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. |
339 | * HANDLE gets cleared if stripe_handle leave nothing locked. | 342 | * HANDLE gets cleared if stripe_handle leaves nothing locked. |
340 | */ | 343 | */ |
341 | 344 | ||
342 | 345 | ||
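The delay rule above boils down to a small guard in the write-handling path: if pre-reading would be required but PREREAD_ACTIVE has not been granted, the stripe is parked on the delayed queue instead of issuing reads. Sketch (hypothetical helper name; real flag names):

static void maybe_delay_preread(struct stripe_head *sh)
{
	if (!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state) &&
	    !test_bit(STRIPE_DELAYED, &sh->state)) {
		/* not allowed to pre-read yet: defer via the delayed list */
		set_bit(STRIPE_DELAYED, &sh->state);
		set_bit(STRIPE_HANDLE, &sh->state);
	}
}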
@@ -399,7 +402,7 @@ struct raid5_private_data { | |||
399 | * (fresh device added). | 402 | * (fresh device added). |
400 | * Cleared when a sync completes. | 403 | * Cleared when a sync completes. |
401 | */ | 404 | */ |
402 | 405 | int recovery_disabled; | |
403 | /* per cpu variables */ | 406 | /* per cpu variables */ |
404 | struct raid5_percpu { | 407 | struct raid5_percpu { |
405 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 408 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |