path: root/drivers/md
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                  |    5
-rw-r--r--  drivers/md/bitmap.c                 |  137
-rw-r--r--  drivers/md/bitmap.h                 |    5
-rw-r--r--  drivers/md/dm-crypt.c               |  647
-rw-r--r--  drivers/md/dm-flakey.c              |  272
-rw-r--r--  drivers/md/dm-io.c                  |   29
-rw-r--r--  drivers/md/dm-ioctl.c               |   89
-rw-r--r--  drivers/md/dm-kcopyd.c              |   45
-rw-r--r--  drivers/md/dm-log-userspace-base.c  |    3
-rw-r--r--  drivers/md/dm-log.c                 |   32
-rw-r--r--  drivers/md/dm-mpath.c               |  149
-rw-r--r--  drivers/md/dm-queue-length.c        |    2
-rw-r--r--  drivers/md/dm-raid.c                |  621
-rw-r--r--  drivers/md/dm-snap-persistent.c     |   80
-rw-r--r--  drivers/md/dm-snap.c                |   84
-rw-r--r--  drivers/md/dm-table.c               |  187
-rw-r--r--  drivers/md/dm.c                     |   75
-rw-r--r--  drivers/md/dm.h                     |    2
-rw-r--r--  drivers/md/linear.c                 |    8
-rw-r--r--  drivers/md/linear.h                 |    2
-rw-r--r--  drivers/md/md.c                     |  945
-rw-r--r--  drivers/md/md.h                     |  112
-rw-r--r--  drivers/md/multipath.c              |    3
-rw-r--r--  drivers/md/raid1.c                  |  980
-rw-r--r--  drivers/md/raid1.h                  |   26
-rw-r--r--  drivers/md/raid10.c                 | 1209
-rw-r--r--  drivers/md/raid10.h                 |   21
-rw-r--r--  drivers/md/raid5.c                  | 1025
-rw-r--r--  drivers/md/raid5.h                  |   99
29 files changed, 4621 insertions(+), 2273 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5e..f75a66e7d31 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
 	  needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-	tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+	tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
+	select MD_RAID1
 	select MD_RAID456
 	select BLK_DEV_MD
 	---help---
-	  A dm target that supports RAID4, RAID5 and RAID6 mappings
+	  A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	  A RAID-5 set of N drives with a capacity of C MB per drive provides
 	  the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd..0dc6546b77a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
 #include "md.h"
 #include "bitmap.h"
 
-#include <linux/dm-dirty-log.h>
 /* debug macros */
 
 #define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
  * 0 or page 1
  */
 static inline struct page *filemap_get_page(struct bitmap *bitmap,
 					    unsigned long chunk)
 {
-	if (bitmap->filemap == NULL)
-		return NULL;
 	if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
 		return NULL;
 	return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
 static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	if (page)
-		__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__set_bit(attr, &bitmap->logattrs);
+	__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
 				enum bitmap_page_attr attr)
 {
-	if (page)
-		__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		__clear_bit(attr, &bitmap->logattrs);
+	__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
 					   enum bitmap_page_attr attr)
 {
-	if (page)
-		return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
-	else
-		return test_bit(attr, &bitmap->logattrs);
+	return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
 }
 
 /*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
 static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 {
 	unsigned long bit;
-	struct page *page = NULL;
+	struct page *page;
 	void *kaddr;
 	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
 
-	if (!bitmap->filemap) {
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		if (log)
-			log->type->mark_region(log, chunk);
-	} else {
+	if (!bitmap->filemap)
+		return;
 
 	page = filemap_get_page(bitmap, chunk);
 	if (!page)
 		return;
 	bit = file_page_offset(bitmap, chunk);
 
 	/* set the bit */
 	kaddr = kmap_atomic(page, KM_USER0);
 	if (bitmap->flags & BITMAP_HOSTENDIAN)
 		set_bit(bit, kaddr);
 	else
-		__test_and_set_bit_le(bit, kaddr);
+		__set_bit_le(bit, kaddr);
 	kunmap_atomic(kaddr, KM_USER0);
 	PRINTK("set file bit %lu page %lu\n", bit, page->index);
-	}
 	/* record page number so it gets flushed to disk when unplug occurs */
 	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
 
 	if (!bitmap)
 		return;
-	if (!bitmap->filemap) {
-		/* Must be using a dirty_log */
-		struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
-		dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
-		need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
-		if (dirty || need_write)
-			if (log->type->flush(log))
-				bitmap->flags |= BITMAP_WRITE_ERROR;
-		goto out;
-	}
 
 	/* look at each page to see if there are any set bits that need to be
 	 * flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
 		else
 			md_super_wait(bitmap->mddev);
 	}
-out:
 	if (bitmap->flags & BITMAP_WRITE_ERROR)
 		bitmap_file_kick(bitmap);
 }
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
 	struct page *page = NULL, *lastpage = NULL;
 	sector_t blocks;
 	void *paddr;
-	struct dm_dirty_log *log = mddev->bitmap_info.log;
 
 	/* Use a mutex to guard daemon_work against
 	 * bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
 	spin_lock_irqsave(&bitmap->lock, flags);
 	for (j = 0; j < bitmap->chunks; j++) {
 		bitmap_counter_t *bmc;
-		if (!bitmap->filemap) {
-			if (!log)
-				/* error or shutdown */
-				break;
-		} else
-			page = filemap_get_page(bitmap, j);
+		if (!bitmap->filemap)
+			/* error or shutdown */
+			break;
+
+		page = filemap_get_page(bitmap, j);
 
 		if (page != lastpage) {
 			/* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
 						  -1);
 
 			/* clear the bit */
-			if (page) {
-				paddr = kmap_atomic(page, KM_USER0);
-				if (bitmap->flags & BITMAP_HOSTENDIAN)
-					clear_bit(file_page_offset(bitmap, j),
-						  paddr);
-				else
-					__test_and_clear_bit_le(file_page_offset(bitmap, j),
-								paddr);
-				kunmap_atomic(paddr, KM_USER0);
-			} else
-				log->type->clear_region(log, j);
+			paddr = kmap_atomic(page, KM_USER0);
+			if (bitmap->flags & BITMAP_HOSTENDIAN)
+				clear_bit(file_page_offset(bitmap, j),
+					  paddr);
+			else
+				__clear_bit_le(
+						file_page_offset(bitmap,
+								 j),
+						paddr);
+			kunmap_atomic(paddr, KM_USER0);
 		}
 	} else
 		j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
 	/* now sync the final page */
-	if (lastpage != NULL || log != NULL) {
+	if (lastpage != NULL) {
 		spin_lock_irqsave(&bitmap->lock, flags);
 		if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
 			clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
-			if (lastpage)
-				write_page(bitmap, lastpage, 0);
-			else
-				if (log->type->flush(log))
-					bitmap->flags |= BITMAP_WRITE_ERROR;
+			write_page(bitmap, lastpage, 0);
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
 			spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
 	BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
 
 	if (!file
-	    && !mddev->bitmap_info.offset
-	    && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
+	    && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
 		return 0;
 
 	BUG_ON(file && mddev->bitmap_info.offset);
-	BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
 int bitmap_load(mddev_t *mddev)
 {
 	int err = 0;
+	sector_t start = 0;
 	sector_t sector = 0;
 	struct bitmap *bitmap = mddev->bitmap;
 
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
 	}
 	bitmap_close_sync(bitmap);
 
-	if (mddev->bitmap_info.log) {
-		unsigned long i;
-		struct dm_dirty_log *log = mddev->bitmap_info.log;
-		for (i = 0; i < bitmap->chunks; i++)
-			if (!log->type->in_sync(log, i, 1))
-				bitmap_set_memory_bits(bitmap,
-						       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
-						       1);
-	} else {
-		sector_t start = 0;
-		if (mddev->degraded == 0
-		    || bitmap->events_cleared == mddev->events)
-			/* no need to keep dirty bits to optimise a
-			 * re-add of a missing device */
-			start = mddev->recovery_cp;
-
-		err = bitmap_init_from_disk(bitmap, start);
-	}
+	if (mddev->degraded == 0
+	    || bitmap->events_cleared == mddev->events)
+		/* no need to keep dirty bits to optimise a
+		 * re-add of a missing device */
+		start = mddev->recovery_cp;
+
+	err = bitmap_init_from_disk(bitmap, start);
+
 	if (err)
 		goto out;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891a..a28f2e5588c 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
 	unsigned long file_pages; /* number of pages in the file */
 	int last_page_size; /* bytes in the last page */
 
-	unsigned long logattrs; /* used when filemap_attr doesn't exist
-				 * because we are working with a dirty_log
-				 */
-
 	unsigned long flags;
 
 	int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
 	wait_queue_head_t behind_wait;
 
 	struct sysfs_dirent *sysfs_can_clear;
-
 };
 
 /* the bitmap API */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c8827ffd85b..1f1d3423d39 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,19 +18,14 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
-#include <crypto/hash.h>
-#include <crypto/md5.h>
-#include <crypto/algapi.h>
 
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)
 
 /*
  * context holding the current state of a multi-part conversion
@@ -67,7 +62,6 @@ struct dm_crypt_request {
 	struct convert_context *ctx;
 	struct scatterlist sg_in;
 	struct scatterlist sg_out;
-	sector_t iv_sector;
 };
 
 struct crypt_config;
@@ -78,13 +72,11 @@ struct crypt_iv_operations {
 	void (*dtr)(struct crypt_config *cc);
 	int (*init)(struct crypt_config *cc);
 	int (*wipe)(struct crypt_config *cc);
-	int (*generator)(struct crypt_config *cc, u8 *iv,
-			 struct dm_crypt_request *dmreq);
-	int (*post)(struct crypt_config *cc, u8 *iv,
-		    struct dm_crypt_request *dmreq);
+	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
 };
 
 struct iv_essiv_private {
+	struct crypto_cipher *tfm;
 	struct crypto_hash *hash_tfm;
 	u8 *salt;
 };
@@ -93,32 +85,11 @@ struct iv_benbi_private {
 	int shift;
 };
 
-#define LMK_SEED_SIZE 64 /* hash + 0 */
-struct iv_lmk_private {
-	struct crypto_shash *hash_tfm;
-	u8 *seed;
-};
-
 /*
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
-
-/*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-	/* ESSIV: struct crypto_cipher *essiv_tfm */
-	void *iv_private;
-	struct crypto_ablkcipher *tfms[0];
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
- */
 struct crypt_config {
 	struct dm_dev *dev;
 	sector_t start;
@@ -142,19 +113,11 @@ struct crypt_config {
 	union {
 		struct iv_essiv_private essiv;
 		struct iv_benbi_private benbi;
-		struct iv_lmk_private lmk;
 	} iv_gen_private;
 	sector_t iv_offset;
 	unsigned int iv_size;
 
 	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-	unsigned tfms_count;
-
-	/*
 	 * Layout of each crypto request:
 	 *
 	 *   struct ablkcipher_request
@@ -168,10 +131,11 @@ struct crypt_config {
 	 * correctly aligned.
 	 */
 	unsigned int dmreq_start;
+	struct ablkcipher_request *req;
 
+	struct crypto_ablkcipher *tfm;
 	unsigned long flags;
 	unsigned int key_size;
-	unsigned int key_parts;
 	u8 key[0];
 };
 
@@ -183,20 +147,6 @@ static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
-static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
-
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
-/*
- * Use this to access cipher attributes that are the same for each CPU.
- */
-static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
-{
-	return __this_cpu_ptr(cc->cpu)->tfms[0];
-}
 
 /*
  * Different IV generation algorithms:
@@ -217,38 +167,23 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  * null: the initial vector is always zero.  Provides compatibility with
  *       obsolete loop_fish2 devices.  Do not use for new devices.
  *
- * lmk:  Compatible implementation of the block chaining mode used
- *       by the Loop-AES block device encryption system
- *       designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
- *       It operates on full 512 byte sectors and uses CBC
- *       with an IV derived from the sector number, the data and
- *       optionally extra IV seed.
- *       This means that after decryption the first block
- *       of sector must be tweaked according to decrypted data.
- *       Loop-AES can use three encryption schemes:
- *         version 1: is plain aes-cbc mode
- *         version 2: uses 64 multikey scheme with lmk IV generator
- *         version 3: the same as version 2 with additional IV seed
- *                    (it uses 65 keys, last key is used as IV seed)
- *
  * plumb: unimplemented, see:
  *   http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
 
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
-			      struct dm_crypt_request *dmreq)
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+	*(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
 
 	return 0;
 }
 
 static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
-				struct dm_crypt_request *dmreq)
+				sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(u64 *)iv = cpu_to_le64(sector);
 
 	return 0;
 }
@@ -259,8 +194,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	struct hash_desc desc;
 	struct scatterlist sg;
-	struct crypto_cipher *essiv_tfm;
-	int err, cpu;
+	int err;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
 	desc.tfm = essiv->hash_tfm;
@@ -270,16 +204,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	if (err)
 		return err;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
-
-		err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
 			    crypto_hash_digestsize(essiv->hash_tfm));
-		if (err)
-			return err;
-	}
-
-	return 0;
 }
 
 /* Wipe salt and reset key derived from volume key */
@@ -287,76 +213,24 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
-	struct crypto_cipher *essiv_tfm;
-	int cpu, r, err = 0;
 
 	memset(essiv->salt, 0, salt_size);
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
-		r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
-		if (r)
-			err = r;
-	}
-
-	return err;
-}
-
-/* Set up per cpu cipher state */
-static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
-					     struct dm_target *ti,
-					     u8 *salt, unsigned saltsize)
-{
-	struct crypto_cipher *essiv_tfm;
-	int err;
-
-	/* Setup the essiv_tfm with the given salt */
-	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(essiv_tfm)) {
-		ti->error = "Error allocating crypto tfm for ESSIV";
-		return essiv_tfm;
-	}
-
-	if (crypto_cipher_blocksize(essiv_tfm) !=
-	    crypto_ablkcipher_ivsize(any_tfm(cc))) {
-		ti->error = "Block size of ESSIV cipher does "
-			    "not match IV size of block cipher";
-		crypto_free_cipher(essiv_tfm);
-		return ERR_PTR(-EINVAL);
-	}
-
-	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
-	if (err) {
-		ti->error = "Failed to set key for ESSIV cipher";
-		crypto_free_cipher(essiv_tfm);
-		return ERR_PTR(err);
-	}
-
-	return essiv_tfm;
+	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
 }
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
-	int cpu;
-	struct crypt_cpu *cpu_cc;
-	struct crypto_cipher *essiv_tfm;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
+	crypto_free_cipher(essiv->tfm);
+	essiv->tfm = NULL;
+
 	crypto_free_hash(essiv->hash_tfm);
 	essiv->hash_tfm = NULL;
 
 	kzfree(essiv->salt);
 	essiv->salt = NULL;
-
-	for_each_possible_cpu(cpu) {
-		cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-		essiv_tfm = cpu_cc->iv_private;
-
-		if (essiv_tfm)
-			crypto_free_cipher(essiv_tfm);
-
-		cpu_cc->iv_private = NULL;
-	}
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -365,7 +239,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
 	u8 *salt = NULL;
-	int err, cpu;
+	int err;
 
 	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
@@ -387,44 +261,48 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		goto bad;
 	}
 
+	/* Allocate essiv_tfm */
+	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(essiv_tfm)) {
+		ti->error = "Error allocating crypto tfm for ESSIV";
+		err = PTR_ERR(essiv_tfm);
+		goto bad;
+	}
+	if (crypto_cipher_blocksize(essiv_tfm) !=
+	    crypto_ablkcipher_ivsize(cc->tfm)) {
+		ti->error = "Block size of ESSIV cipher does "
+			    "not match IV size of block cipher";
+		err = -EINVAL;
+		goto bad;
+	}
+
 	cc->iv_gen_private.essiv.salt = salt;
+	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
-	for_each_possible_cpu(cpu) {
-		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
-					crypto_hash_digestsize(hash_tfm));
-		if (IS_ERR(essiv_tfm)) {
-			crypt_iv_essiv_dtr(cc);
-			return PTR_ERR(essiv_tfm);
-		}
-		per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
-	}
-
 	return 0;
 
 bad:
+	if (essiv_tfm && !IS_ERR(essiv_tfm))
+		crypto_free_cipher(essiv_tfm);
 	if (hash_tfm && !IS_ERR(hash_tfm))
 		crypto_free_hash(hash_tfm);
 	kfree(salt);
 	return err;
 }
 
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
-			      struct dm_crypt_request *dmreq)
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
-	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
-
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
-	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
-
+	*(u64 *)iv = cpu_to_le64(sector);
+	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
 	return 0;
 }
 
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
+	unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
 	int log = ilog2(bs);
 
 	/* we need to calculate how far we must shift the sector count
@@ -449,177 +327,25 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
 {
 }
 
-static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
-			      struct dm_crypt_request *dmreq)
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	__be64 val;
 
 	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
 
-	val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
+	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
 	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
 	return 0;
 }
 
-static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
-			     struct dm_crypt_request *dmreq)
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
 
 	return 0;
 }
 
-static void crypt_iv_lmk_dtr(struct crypt_config *cc)
-{
-	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
-
-	if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
-		crypto_free_shash(lmk->hash_tfm);
-	lmk->hash_tfm = NULL;
-
-	kzfree(lmk->seed);
-	lmk->seed = NULL;
-}
-
-static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
-			    const char *opts)
-{
-	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
-
-	lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
-	if (IS_ERR(lmk->hash_tfm)) {
-		ti->error = "Error initializing LMK hash";
-		return PTR_ERR(lmk->hash_tfm);
-	}
-
-	/* No seed in LMK version 2 */
-	if (cc->key_parts == cc->tfms_count) {
-		lmk->seed = NULL;
-		return 0;
-	}
-
-	lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
-	if (!lmk->seed) {
-		crypt_iv_lmk_dtr(cc);
-		ti->error = "Error kmallocing seed storage in LMK";
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static int crypt_iv_lmk_init(struct crypt_config *cc)
-{
-	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
-	int subkey_size = cc->key_size / cc->key_parts;
-
-	/* LMK seed is on the position of LMK_KEYS + 1 key */
-	if (lmk->seed)
-		memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
-		       crypto_shash_digestsize(lmk->hash_tfm));
-
-	return 0;
-}
-
-static int crypt_iv_lmk_wipe(struct crypt_config *cc)
-{
-	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
-
-	if (lmk->seed)
-		memset(lmk->seed, 0, LMK_SEED_SIZE);
-
-	return 0;
-}
-
-static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
-			    struct dm_crypt_request *dmreq,
-			    u8 *data)
-{
-	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
-	struct {
-		struct shash_desc desc;
-		char ctx[crypto_shash_descsize(lmk->hash_tfm)];
-	} sdesc;
-	struct md5_state md5state;
-	u32 buf[4];
-	int i, r;
-
-	sdesc.desc.tfm = lmk->hash_tfm;
-	sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	r = crypto_shash_init(&sdesc.desc);
-	if (r)
-		return r;
-
-	if (lmk->seed) {
-		r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
-		if (r)
-			return r;
-	}
-
-	/* Sector is always 512B, block size 16, add data of blocks 1-31 */
-	r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
-	if (r)
-		return r;
-
-	/* Sector is cropped to 56 bits here */
-	buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
-	buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
-	buf[2] = cpu_to_le32(4024);
-	buf[3] = 0;
-	r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
-	if (r)
-		return r;
-
-	/* No MD5 padding here */
-	r = crypto_shash_export(&sdesc.desc, &md5state);
-	if (r)
-		return r;
-
-	for (i = 0; i < MD5_HASH_WORDS; i++)
-		__cpu_to_le32s(&md5state.hash[i]);
-	memcpy(iv, &md5state.hash, cc->iv_size);
-
-	return 0;
-}
-
-static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
-			    struct dm_crypt_request *dmreq)
-{
-	u8 *src;
-	int r = 0;
-
-	if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
-		src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
-		r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
-		kunmap_atomic(src, KM_USER0);
-	} else
-		memset(iv, 0, cc->iv_size);
-
-	return r;
-}
-
-static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
-			     struct dm_crypt_request *dmreq)
-{
-	u8 *dst;
-	int r;
-
-	if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
-		return 0;
-
-	dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
-	r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
-
-	/* Tweak the first block of plaintext sector */
-	if (!r)
-		crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
-
-	kunmap_atomic(dst, KM_USER0);
-	return r;
-}
-
 static struct crypt_iv_operations crypt_iv_plain_ops = {
 	.generator = crypt_iv_plain_gen
 };
@@ -646,15 +372,6 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
 	.generator = crypt_iv_null_gen
 };
 
-static struct crypt_iv_operations crypt_iv_lmk_ops = {
-	.ctr	   = crypt_iv_lmk_ctr,
-	.dtr	   = crypt_iv_lmk_dtr,
-	.init	   = crypt_iv_lmk_init,
-	.wipe	   = crypt_iv_lmk_wipe,
-	.generator = crypt_iv_lmk_gen,
-	.post	   = crypt_iv_lmk_post
-};
-
 static void crypt_convert_init(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct bio *bio_out, struct bio *bio_in,
@@ -682,13 +399,6 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
 	return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
 }
 
-static u8 *iv_of_dmreq(struct crypt_config *cc,
-		       struct dm_crypt_request *dmreq)
-{
-	return (u8 *)ALIGN((unsigned long)(dmreq + 1),
-		crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
-}
-
 static int crypt_convert_block(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct ablkcipher_request *req)
@@ -700,9 +410,9 @@ static int crypt_convert_block(struct crypt_config *cc,
 	int r = 0;
 
 	dmreq = dmreq_of_req(cc, req);
-	iv = iv_of_dmreq(cc, dmreq);
+	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
+			 crypto_ablkcipher_alignmask(cc->tfm) + 1);
 
-	dmreq->iv_sector = ctx->sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
 	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -725,7 +435,7 @@ static int crypt_convert_block(struct crypt_config *cc,
 	}
 
 	if (cc->iv_gen_ops) {
-		r = cc->iv_gen_ops->generator(cc, iv, dmreq);
+		r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
 		if (r < 0)
 			return r;
 	}
@@ -738,28 +448,21 @@ static int crypt_convert_block(struct crypt_config *cc,
 	else
 		r = crypto_ablkcipher_decrypt(req);
 
-	if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
-		r = cc->iv_gen_ops->post(cc, iv, dmreq);
-
 	return r;
 }
 
 static void kcryptd_async_done(struct crypto_async_request *async_req,
 			       int error);
-
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
-	unsigned key_index = ctx->sector & (cc->tfms_count - 1);
-
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
-
-	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
-	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	if (!cc->req)
+		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	ablkcipher_request_set_tfm(cc->req, cc->tfm);
+	ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+					CRYPTO_TFM_REQ_MAY_SLEEP,
+					kcryptd_async_done,
+					dmreq_of_req(cc, cc->req));
 }
 
 /*
@@ -768,7 +471,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->pending, 1);
@@ -780,7 +482,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, cc->req);
 
 		switch (r) {
 		/* async */
@@ -789,7 +491,7 @@ static int crypt_convert(struct crypt_config *cc,
 			INIT_COMPLETION(ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			cc->req = NULL;
 			ctx->sector++;
 			continue;
 
@@ -948,9 +650,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * They must be separated as otherwise the final stages could be
  * starved by new requests which can block in the first stages due
  * to memory allocation.
- *
- * The work is done per CPU global for all dm-crypt instances.
- * They should not depend on each other and do not block.
  */
 static void crypt_endio(struct bio *clone, int error)
 {
@@ -991,22 +690,25 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
+static void kcryptd_io_read(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
+	crypt_inc_pending(io);
+
 	/*
 	 * The block layer might modify the bvec array, so always
 	 * copy the required bvecs because we need the original
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
-	clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
-	if (!clone)
-		return 1;
-
-	crypt_inc_pending(io);
+	clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
+	if (unlikely(!clone)) {
+		io->error = -ENOMEM;
+		crypt_dec_pending(io);
+		return;
+	}
 
 	clone_init(io, clone);
 	clone->bi_idx = 0;
@@ -1017,7 +719,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	       sizeof(struct bio_vec) * clone->bi_vcnt);
 
 	generic_make_request(clone);
-	return 0;
 }
 
 static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -1030,12 +731,9 @@ static void kcryptd_io(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		crypt_inc_pending(io);
-		if (kcryptd_io_read(io, GFP_NOIO))
-			io->error = -ENOMEM;
-		crypt_dec_pending(io);
-	} else
+	if (bio_data_dir(io->base_bio) == READ)
+		kcryptd_io_read(io);
+	else
 		kcryptd_io_write(io);
 }
 
@@ -1202,9 +900,6 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 		return;
 	}
 
-	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
-		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
-
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
 	if (!atomic_dec_and_test(&ctx->pending))
@@ -1275,93 +970,34 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
-static void crypt_free_tfms(struct crypt_config *cc, int cpu)
-{
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-	unsigned i;
-
-	for (i = 0; i < cc->tfms_count; i++)
-		if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
-			crypto_free_ablkcipher(cpu_cc->tfms[i]);
-			cpu_cc->tfms[i] = NULL;
-		}
-}
-
-static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
-{
-	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-	unsigned i;
-	int err;
-
-	for (i = 0; i < cc->tfms_count; i++) {
-		cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
-		if (IS_ERR(cpu_cc->tfms[i])) {
-			err = PTR_ERR(cpu_cc->tfms[i]);
-			crypt_free_tfms(cc, cpu);
-			return err;
-		}
-	}
-
-	return 0;
-}
-
-static int crypt_setkey_allcpus(struct crypt_config *cc)
-{
-	unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
-	int cpu, err = 0, i, r;
-
-	for_each_possible_cpu(cpu) {
-		for (i = 0; i < cc->tfms_count; i++) {
-			r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
-						     cc->key + (i * subkey_size), subkey_size);
-			if (r)
-				err = r;
-		}
-	}
-
-	return err;
-}
-
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
-	int r = -EINVAL;
-	int key_string_len = strlen(key);
-
 	/* The key size may not be changed. */
-	if (cc->key_size != (key_string_len >> 1))
-		goto out;
+	if (cc->key_size != (strlen(key) >> 1))
+		return -EINVAL;
 
 	/* Hyphen (which gives a key_size of zero) means there is no key. */
 	if (!cc->key_size && strcmp(key, "-"))
-		goto out;
+		return -EINVAL;
 
 	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
-		goto out;
+		return -EINVAL;
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	r = crypt_setkey_allcpus(cc);
-
-out:
-	/* Hex key string not needed after here, so wipe it. */
-	memset(key, '0', key_string_len);
-
-	return r;
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 	memset(&cc->key, 0, cc->key_size * sizeof(u8));
-
-	return crypt_setkey_allcpus(cc);
+	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
 }
 
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1373,14 +1009,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-			crypt_free_tfms(cc, cpu);
-		}
-
 	if (cc->bs)
 		bioset_free(cc->bs);
 
@@ -1394,12 +1022,12 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
 
+	if (cc->tfm && !IS_ERR(cc->tfm))
+		crypto_free_ablkcipher(cc->tfm);
+
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1411,9 +1039,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 			    char *cipher_in, char *key)
 {
 	struct crypt_config *cc = ti->private;
-	char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
+	char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
 	char *cipher_api = NULL;
-	int cpu, ret = -EINVAL;
+	int ret = -EINVAL;
 
 	/* Convert to crypto api definition? */
 	if (strchr(cipher_in, '(')) {
@@ -1427,20 +1055,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 
 	/*
 	 * Legacy dm-crypt cipher specification
-	 * cipher[:keycount]-mode-iv:ivopts
+	 * cipher-mode-iv:ivopts
 	 */
 	tmp = cipher_in;
-	keycount = strsep(&tmp, "-");
-	cipher = strsep(&keycount, ":");
-
-	if (!keycount)
-		cc->tfms_count = 1;
-	else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
-		 !is_power_of_2(cc->tfms_count)) {
-		ti->error = "Bad cipher key count specification";
-		return -EINVAL;
-	}
-	cc->key_parts = cc->tfms_count;
+	cipher = strsep(&tmp, "-");
 
 	cc->cipher = kstrdup(cipher, GFP_KERNEL);
 	if (!cc->cipher)
@@ -1453,14 +1071,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
-				 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
@@ -1487,12 +1097,11 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Allocate cipher */
-	for_each_possible_cpu(cpu) {
-		ret = crypt_alloc_tfms(cc, cpu, cipher_api);
-		if (ret < 0) {
+	cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
+	if (IS_ERR(cc->tfm)) {
+		ret = PTR_ERR(cc->tfm);
 		ti->error = "Error allocating crypto tfm";
 		goto bad;
-		}
 	}
 
 	/* Initialize and set key */
@@ -1503,7 +1112,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Initialize IV */
-	cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
+	cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
 		cc->iv_size = max(cc->iv_size,
@@ -1526,15 +1135,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 		cc->iv_gen_ops = &crypt_iv_benbi_ops;
 	else if (strcmp(ivmode, "null") == 0)
 		cc->iv_gen_ops = &crypt_iv_null_ops;
-	else if (strcmp(ivmode, "lmk") == 0) {
-		cc->iv_gen_ops = &crypt_iv_lmk_ops;
-		/* Version 2 and 3 is recognised according
-		 * to length of provided multi-key string.
-		 * If present (version 3), last key is used as IV seed.
-		 */
-		if (cc->key_size % cc->key_parts)
-			cc->key_parts++;
-	} else {
+	else {
 		ret = -EINVAL;
 		ti->error = "Invalid IV mode";
 		goto bad;
@@ -1575,11 +1176,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
 
-	if (argc != 5) {
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1606,9 +1213,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
-	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
+	cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
 	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
-	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
+	cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1617,6 +1224,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
+	cc->req = NULL;
 
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
@@ -1648,27 +1256,46 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	cc->start = tmpll;
 
+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
-	cc->io_queue = alloc_workqueue("kcryptd_io",
-				       WQ_NON_REENTRANT|
-				       WQ_MEM_RECLAIM,
-				       1);
+	cc->io_queue = create_singlethread_workqueue("kcryptd_io");
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
-	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_NON_REENTRANT|
-					  WQ_CPU_INTENSIVE|
-					  WQ_MEM_RECLAIM,
-					  1);
+	cc->crypt_queue = create_singlethread_workqueue("kcryptd");
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
 	}
 
 	ti->num_flush_requests = 1;
+	ti->discard_zeroes_data_unsupported = 1;
+
 	return 0;
 
 bad:
@@ -1682,18 +1309,24 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
 	io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		if (kcryptd_io_read(io, GFP_NOWAIT))
-			kcryptd_queue_io(io);
-	} else
+	if (bio_data_dir(io->base_bio) == READ)
+		kcryptd_queue_io(io);
+	else
 		kcryptd_queue_crypt(io);
 
 	return DM_MAPIO_SUBMITTED;
@@ -1727,6 +1360,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1770,12 +1407,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (argc < 2)
 		goto error;
 
-	if (!strnicmp(argv[0], MESG_STR("key"))) {
+	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+		if (argc == 3 && !strcasecmp(argv[1], "set")) {
 			ret = crypt_set_key(cc, argv[2]);
 			if (ret)
 				return ret;
@@ -1783,7 +1420,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 				ret = cc->iv_gen_ops->init(cc);
 			return ret;
 		}
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+		if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
 			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
 				ret = cc->iv_gen_ops->wipe(cc);
 				if (ret)
@@ -1823,7 +1460,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 8, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea790623c30..f84c08029b2 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2003 Sistina Software (UK) Limited. 2 * Copyright (C) 2003 Sistina Software (UK) Limited.
3 * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,9 @@
15 15
16#define DM_MSG_PREFIX "flakey" 16#define DM_MSG_PREFIX "flakey"
17 17
18#define all_corrupt_bio_flags_match(bio, fc) \
19 (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
20
18/* 21/*
19 * Flakey: Used for testing only, simulates intermittent, 22 * Flakey: Used for testing only, simulates intermittent,
20 * catastrophic device failure. 23 * catastrophic device failure.
@@ -25,60 +28,191 @@ struct flakey_c {
25 sector_t start; 28 sector_t start;
26 unsigned up_interval; 29 unsigned up_interval;
27 unsigned down_interval; 30 unsigned down_interval;
31 unsigned long flags;
32 unsigned corrupt_bio_byte;
33 unsigned corrupt_bio_rw;
34 unsigned corrupt_bio_value;
35 unsigned corrupt_bio_flags;
36};
37
38enum feature_flag_bits {
39 DROP_WRITES
28}; 40};
29 41
42static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
43 struct dm_target *ti)
44{
45 int r;
46 unsigned argc;
47 const char *arg_name;
48
49 static struct dm_arg _args[] = {
50 {0, 6, "Invalid number of feature args"},
51 {1, UINT_MAX, "Invalid corrupt bio byte"},
52 {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
53 {0, UINT_MAX, "Invalid corrupt bio flags mask"},
54 };
55
56 /* No feature arguments supplied. */
57 if (!as->argc)
58 return 0;
59
60 r = dm_read_arg_group(_args, as, &argc, &ti->error);
61 if (r)
62 return r;
63
64 while (argc) {
65 arg_name = dm_shift_arg(as);
66 argc--;
67
68 /*
69 * drop_writes
70 */
71 if (!strcasecmp(arg_name, "drop_writes")) {
72 if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
73 ti->error = "Feature drop_writes duplicated";
74 return -EINVAL;
75 }
76
77 continue;
78 }
79
80 /*
81 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
82 */
83 if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
84 if (!argc) {
85 ti->error = "Feature corrupt_bio_byte requires parameters";
86 return -EINVAL;
87 }
88
89 r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
90 if (r)
91 return r;
92 argc--;
93
94 /*
95 * Direction r or w?
96 */
97 arg_name = dm_shift_arg(as);
98 if (!strcasecmp(arg_name, "w"))
99 fc->corrupt_bio_rw = WRITE;
100 else if (!strcasecmp(arg_name, "r"))
101 fc->corrupt_bio_rw = READ;
102 else {
103 ti->error = "Invalid corrupt bio direction (r or w)";
104 return -EINVAL;
105 }
106 argc--;
107
108 /*
109 * Value of byte (0-255) to write in place of correct one.
110 */
111 r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
112 if (r)
113 return r;
114 argc--;
115
116 /*
117 * Only corrupt bios with these flags set.
118 */
119 r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
120 if (r)
121 return r;
122 argc--;
123
124 continue;
125 }
126
127 ti->error = "Unrecognised flakey feature requested";
128 return -EINVAL;
129 }
130
131 if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
132 ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
133 return -EINVAL;
134 }
135
136 return 0;
137}
138
30/* 139/*
31 * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> 140 * Construct a flakey mapping:
141 * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
142 *
143 * Feature args:
144 * [drop_writes]
145 * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
146 *
147 * Nth_byte starts from 1 for the first byte.
148 * Direction is r for READ or w for WRITE.
149 * bio_flags is ignored if 0.
32 */ 150 */
33static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) 151static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
34{ 152{
153 static struct dm_arg _args[] = {
154 {0, UINT_MAX, "Invalid up interval"},
155 {0, UINT_MAX, "Invalid down interval"},
156 };
157
158 int r;
35 struct flakey_c *fc; 159 struct flakey_c *fc;
36 unsigned long long tmp; 160 unsigned long long tmpll;
161 struct dm_arg_set as;
162 const char *devname;
37 163
38 if (argc != 4) { 164 as.argc = argc;
39 ti->error = "dm-flakey: Invalid argument count"; 165 as.argv = argv;
166
167 if (argc < 4) {
168 ti->error = "Invalid argument count";
40 return -EINVAL; 169 return -EINVAL;
41 } 170 }
42 171
43 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 172 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
44 if (!fc) { 173 if (!fc) {
45 ti->error = "dm-flakey: Cannot allocate linear context"; 174 ti->error = "Cannot allocate linear context";
46 return -ENOMEM; 175 return -ENOMEM;
47 } 176 }
48 fc->start_time = jiffies; 177 fc->start_time = jiffies;
49 178
50 if (sscanf(argv[1], "%llu", &tmp) != 1) { 179 devname = dm_shift_arg(&as);
51 ti->error = "dm-flakey: Invalid device sector"; 180
181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
182 ti->error = "Invalid device sector";
52 goto bad; 183 goto bad;
53 } 184 }
54 fc->start = tmp; 185 fc->start = tmpll;
55 186
56 if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { 187 r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
57 ti->error = "dm-flakey: Invalid up interval"; 188 if (r)
58 goto bad; 189 goto bad;
59 }
60 190
61 if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { 191 r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
62 ti->error = "dm-flakey: Invalid down interval"; 192 if (r)
63 goto bad; 193 goto bad;
64 }
65 194
66 if (!(fc->up_interval + fc->down_interval)) { 195 if (!(fc->up_interval + fc->down_interval)) {
67 ti->error = "dm-flakey: Total (up + down) interval is zero"; 196 ti->error = "Total (up + down) interval is zero";
68 goto bad; 197 goto bad;
69 } 198 }
70 199
71 if (fc->up_interval + fc->down_interval < fc->up_interval) { 200 if (fc->up_interval + fc->down_interval < fc->up_interval) {
72 ti->error = "dm-flakey: Interval overflow"; 201 ti->error = "Interval overflow";
73 goto bad; 202 goto bad;
74 } 203 }
75 204
76 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { 205 r = parse_features(&as, fc, ti);
77 ti->error = "dm-flakey: Device lookup failed"; 206 if (r)
207 goto bad;
208
209 if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
210 ti->error = "Device lookup failed";
78 goto bad; 211 goto bad;
79 } 212 }
80 213
81 ti->num_flush_requests = 1; 214 ti->num_flush_requests = 1;
215 ti->num_discard_requests = 1;
82 ti->private = fc; 216 ti->private = fc;
83 return 0; 217 return 0;
84 218
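
The rewritten flakey constructor is built on the generic argument helpers (struct dm_arg, dm_shift_arg(), dm_read_arg(), dm_read_arg_group(), dm_consume_args()) introduced alongside this series; the same helpers replace multipath's private read_param()/arg_set further down. A hedged sketch of the parsing style for a made-up target taking <dev> <offset> [<#features> <feature>...] (example_ctr and the "some_feature" keyword are invented for illustration):

/*
 * Hedged sketch of dm_arg_set parsing for an invented target.
 */
#include <linux/device-mapper.h>
#include <linux/kernel.h>
#include <linux/string.h>

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        static struct dm_arg _args[] = {
                {0, 4, "Invalid number of feature args"},
        };
        struct dm_arg_set as;
        unsigned long long offset;
        unsigned nr_features;
        const char *devname;
        int r;

        if (argc < 2) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        as.argc = argc;
        as.argv = argv;

        devname = dm_shift_arg(&as);            /* consumes argv[0] */
        if (!devname || !*devname) {
                ti->error = "Invalid device name";
                return -EINVAL;
        }

        if (sscanf(dm_shift_arg(&as), "%llu", &offset) != 1) {
                ti->error = "Invalid device offset";
                return -EINVAL;
        }

        /* Optional feature group. */
        if (!as.argc)
                return 0;

        /* Reads the count and verifies that many arguments follow. */
        r = dm_read_arg_group(_args, &as, &nr_features, &ti->error);
        if (r)
                return r;

        while (nr_features--) {
                const char *feature = dm_shift_arg(&as);

                if (!strcasecmp(feature, "some_feature"))
                        continue;

                ti->error = "Unrecognised feature";
                return -EINVAL;
        }

        /* devname would now be handed to dm_get_device(), etc. */
        return 0;
}
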
@@ -99,7 +233,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
99{ 233{
100 struct flakey_c *fc = ti->private; 234 struct flakey_c *fc = ti->private;
101 235
102 return fc->start + (bi_sector - ti->begin); 236 return fc->start + dm_target_offset(ti, bi_sector);
103} 237}
104 238
105static void flakey_map_bio(struct dm_target *ti, struct bio *bio) 239static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +245,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
111 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); 245 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
112} 246}
113 247
248static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
249{
250 unsigned bio_bytes = bio_cur_bytes(bio);
251 char *data = bio_data(bio);
252
253 /*
254 * Overwrite the Nth byte of the data returned.
255 */
256 if (data && bio_bytes >= fc->corrupt_bio_byte) {
257 data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
258
259 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
260 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
261 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
262 (bio_data_dir(bio) == WRITE) ? 'w' : 'r',
263 bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
264 }
265}
266
114static int flakey_map(struct dm_target *ti, struct bio *bio, 267static int flakey_map(struct dm_target *ti, struct bio *bio,
115 union map_info *map_context) 268 union map_info *map_context)
116{ 269{
@@ -119,18 +272,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
119 272
120 /* Are we alive ? */ 273 /* Are we alive ? */
121 elapsed = (jiffies - fc->start_time) / HZ; 274 elapsed = (jiffies - fc->start_time) / HZ;
122 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) 275 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
276 /*
277 * Flag this bio as submitted while down.
278 */
279 map_context->ll = 1;
280
281 /*
282 * Map reads as normal.
283 */
284 if (bio_data_dir(bio) == READ)
285 goto map_bio;
286
287 /*
288 * Drop writes?
289 */
290 if (test_bit(DROP_WRITES, &fc->flags)) {
291 bio_endio(bio, 0);
292 return DM_MAPIO_SUBMITTED;
293 }
294
295 /*
296 * Corrupt matching writes.
297 */
298 if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
299 if (all_corrupt_bio_flags_match(bio, fc))
300 corrupt_bio_data(bio, fc);
301 goto map_bio;
302 }
303
304 /*
305 * By default, error all I/O.
306 */
123 return -EIO; 307 return -EIO;
308 }
124 309
310map_bio:
125 flakey_map_bio(ti, bio); 311 flakey_map_bio(ti, bio);
126 312
127 return DM_MAPIO_REMAPPED; 313 return DM_MAPIO_REMAPPED;
128} 314}
129 315
316static int flakey_end_io(struct dm_target *ti, struct bio *bio,
317 int error, union map_info *map_context)
318{
319 struct flakey_c *fc = ti->private;
320 unsigned bio_submitted_while_down = map_context->ll;
321
322 /*
323 * Corrupt successful READs while in down state.
324 * If flags were specified, only corrupt those that match.
325 */
326 if (!error && bio_submitted_while_down &&
327 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
328 all_corrupt_bio_flags_match(bio, fc))
329 corrupt_bio_data(bio, fc);
330
331 return error;
332}
333
130static int flakey_status(struct dm_target *ti, status_type_t type, 334static int flakey_status(struct dm_target *ti, status_type_t type,
131 char *result, unsigned int maxlen) 335 char *result, unsigned int maxlen)
132{ 336{
337 unsigned sz = 0;
133 struct flakey_c *fc = ti->private; 338 struct flakey_c *fc = ti->private;
339 unsigned drop_writes;
134 340
135 switch (type) { 341 switch (type) {
136 case STATUSTYPE_INFO: 342 case STATUSTYPE_INFO:
@@ -138,9 +344,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
138 break; 344 break;
139 345
140 case STATUSTYPE_TABLE: 346 case STATUSTYPE_TABLE:
141 snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, 347 DMEMIT("%s %llu %u %u ", fc->dev->name,
142 (unsigned long long)fc->start, fc->up_interval, 348 (unsigned long long)fc->start, fc->up_interval,
143 fc->down_interval); 349 fc->down_interval);
350
351 drop_writes = test_bit(DROP_WRITES, &fc->flags);
352 DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
353
354 if (drop_writes)
355 DMEMIT("drop_writes ");
356
357 if (fc->corrupt_bio_byte)
358 DMEMIT("corrupt_bio_byte %u %c %u %u ",
359 fc->corrupt_bio_byte,
360 (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
361 fc->corrupt_bio_value, fc->corrupt_bio_flags);
362
144 break; 363 break;
145 } 364 }
146 return 0; 365 return 0;
@@ -177,11 +396,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
177 396
178static struct target_type flakey_target = { 397static struct target_type flakey_target = {
179 .name = "flakey", 398 .name = "flakey",
180 .version = {1, 1, 0}, 399 .version = {1, 2, 0},
181 .module = THIS_MODULE, 400 .module = THIS_MODULE,
182 .ctr = flakey_ctr, 401 .ctr = flakey_ctr,
183 .dtr = flakey_dtr, 402 .dtr = flakey_dtr,
184 .map = flakey_map, 403 .map = flakey_map,
404 .end_io = flakey_end_io,
185 .status = flakey_status, 405 .status = flakey_status,
186 .ioctl = flakey_ioctl, 406 .ioctl = flakey_ioctl,
187 .merge = flakey_merge, 407 .merge = flakey_merge,
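
flakey 1.2.0 threads a per-bio flag from flakey_map() to flakey_end_io() through union map_info, so READs completed while the device is "down" can be corrupted on completion, while writes are dropped or corrupted on the way in. The map/end_io handshake in miniature; device_is_down() is a hypothetical helper and my_map()/my_end_io() are illustrative only:

/*
 * Carrying per-bio state from .map to .end_io via union map_info.
 */
#include <linux/bio.h>
#include <linux/device-mapper.h>

bool device_is_down(void *private);             /* hypothetical availability check */

static void corrupt_first_byte(struct bio *bio)
{
        char *data = bio_data(bio);

        if (data && bio_cur_bytes(bio))
                data[0] = 0;
}

static int my_map(struct dm_target *ti, struct bio *bio,
                  union map_info *map_context)
{
        /* Remember, per bio, whether it was mapped during a down period. */
        map_context->ll = device_is_down(ti->private) ? 1 : 0;

        /* bi_bdev/bi_sector remapping as in flakey_map_bio() ... */
        return DM_MAPIO_REMAPPED;
}

static int my_end_io(struct dm_target *ti, struct bio *bio,
                     int error, union map_info *map_context)
{
        /* Only touch READs that completed OK and were mapped while down. */
        if (!error && map_context->ll && bio_data_dir(bio) == READ)
                corrupt_first_byte(bio);

        return error;                           /* pass the original status up */
}
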
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288f61f..ad2eba40e31 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@ struct io {
38 struct dm_io_client *client; 38 struct dm_io_client *client;
39 io_notify_fn callback; 39 io_notify_fn callback;
40 void *context; 40 void *context;
41 void *vma_invalidate_address;
42 unsigned long vma_invalidate_size;
41} __attribute__((aligned(DM_IO_MAX_REGIONS))); 43} __attribute__((aligned(DM_IO_MAX_REGIONS)));
42 44
43static struct kmem_cache *_dm_io_cache; 45static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
116 set_bit(region, &io->error_bits); 118 set_bit(region, &io->error_bits);
117 119
118 if (atomic_dec_and_test(&io->count)) { 120 if (atomic_dec_and_test(&io->count)) {
121 if (io->vma_invalidate_size)
122 invalidate_kernel_vmap_range(io->vma_invalidate_address,
123 io->vma_invalidate_size);
124
119 if (io->sleeper) 125 if (io->sleeper)
120 wake_up_process(io->sleeper); 126 wake_up_process(io->sleeper);
121 127
@@ -159,6 +165,9 @@ struct dpages {
159 165
160 unsigned context_u; 166 unsigned context_u;
161 void *context_ptr; 167 void *context_ptr;
168
169 void *vma_invalidate_address;
170 unsigned long vma_invalidate_size;
162}; 171};
163 172
164/* 173/*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
377 io->sleeper = current; 386 io->sleeper = current;
378 io->client = client; 387 io->client = client;
379 388
389 io->vma_invalidate_address = dp->vma_invalidate_address;
390 io->vma_invalidate_size = dp->vma_invalidate_size;
391
380 dispatch_io(rw, num_regions, where, dp, io, 1); 392 dispatch_io(rw, num_regions, where, dp, io, 1);
381 393
382 while (1) { 394 while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
415 io->callback = fn; 427 io->callback = fn;
416 io->context = context; 428 io->context = context;
417 429
430 io->vma_invalidate_address = dp->vma_invalidate_address;
431 io->vma_invalidate_size = dp->vma_invalidate_size;
432
418 dispatch_io(rw, num_regions, where, dp, io, 0); 433 dispatch_io(rw, num_regions, where, dp, io, 0);
419 return 0; 434 return 0;
420} 435}
421 436
422static int dp_init(struct dm_io_request *io_req, struct dpages *dp) 437static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
438 unsigned long size)
423{ 439{
424 /* Set up dpages based on memory type */ 440 /* Set up dpages based on memory type */
441
442 dp->vma_invalidate_address = NULL;
443 dp->vma_invalidate_size = 0;
444
425 switch (io_req->mem.type) { 445 switch (io_req->mem.type) {
426 case DM_IO_PAGE_LIST: 446 case DM_IO_PAGE_LIST:
427 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); 447 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
432 break; 452 break;
433 453
434 case DM_IO_VMA: 454 case DM_IO_VMA:
455 flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
456 if ((io_req->bi_rw & RW_MASK) == READ) {
457 dp->vma_invalidate_address = io_req->mem.ptr.vma;
458 dp->vma_invalidate_size = size;
459 }
435 vm_dp_init(dp, io_req->mem.ptr.vma); 460 vm_dp_init(dp, io_req->mem.ptr.vma);
436 break; 461 break;
437 462
@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
460 int r; 485 int r;
461 struct dpages dp; 486 struct dpages dp;
462 487
463 r = dp_init(io_req, &dp); 488 r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
464 if (r) 489 if (r)
465 return r; 490 return r;
466 491
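
For DM_IO_VMA buffers, dm-io now flushes the kernel vmap alias before issuing the I/O and invalidates it once a READ completes; on architectures with aliasing caches this is what makes data DMA'd into vmalloc()ed memory visible through the vmalloc mapping. The discipline in isolation, where do_read_into() is a hypothetical stand-in for the actual block I/O (for example a dm_io() call):

/*
 * Cache-alias handling around a read into a vmalloc()ed buffer.
 */
#include <linux/highmem.h>

int do_read_into(void *buf, unsigned long size);        /* hypothetical */

static int read_into_vmalloc_buffer(void *buf, unsigned long size)
{
        int err;

        /* Write back any dirty lines held under the vmalloc alias. */
        flush_kernel_vmap_range(buf, size);

        err = do_read_into(buf, size);          /* device DMAs into buf */

        /* Drop now-stale alias lines so the CPU sees the new data. */
        if (!err)
                invalidate_kernel_vmap_range(buf, size);

        return err;
}
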
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad2270..2e9a3ca37bd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
128 return NULL; 128 return NULL;
129} 129}
130 130
131static struct hash_cell *__get_dev_cell(uint64_t dev)
132{
133 struct mapped_device *md;
134 struct hash_cell *hc;
135
136 md = dm_get_md(huge_decode_dev(dev));
137 if (!md)
138 return NULL;
139
140 hc = dm_get_mdptr(md);
141 if (!hc) {
142 dm_put(md);
143 return NULL;
144 }
145
146 return hc;
147}
148
131/*----------------------------------------------------------------- 149/*-----------------------------------------------------------------
132 * Inserting, removing and renaming a device. 150 * Inserting, removing and renaming a device.
133 *---------------------------------------------------------------*/ 151 *---------------------------------------------------------------*/
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
718 */ 736 */
719static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) 737static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
720{ 738{
721 struct mapped_device *md; 739 struct hash_cell *hc = NULL;
722 void *mdptr = NULL;
723 740
724 if (*param->uuid) 741 if (*param->uuid) {
725 return __get_uuid_cell(param->uuid); 742 if (*param->name || param->dev)
743 return NULL;
726 744
727 if (*param->name) 745 hc = __get_uuid_cell(param->uuid);
728 return __get_name_cell(param->name); 746 if (!hc)
747 return NULL;
748 } else if (*param->name) {
749 if (param->dev)
750 return NULL;
729 751
730 md = dm_get_md(huge_decode_dev(param->dev)); 752 hc = __get_name_cell(param->name);
731 if (!md) 753 if (!hc)
732 goto out; 754 return NULL;
755 } else if (param->dev) {
756 hc = __get_dev_cell(param->dev);
757 if (!hc)
758 return NULL;
759 } else
760 return NULL;
733 761
734 mdptr = dm_get_mdptr(md); 762 /*
735 if (!mdptr) 763 * Sneakily write in both the name and the uuid
736 dm_put(md); 764 * while we have the cell.
765 */
766 strlcpy(param->name, hc->name, sizeof(param->name));
767 if (hc->uuid)
768 strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
769 else
770 param->uuid[0] = '\0';
737 771
738out: 772 if (hc->new_map)
739 return mdptr; 773 param->flags |= DM_INACTIVE_PRESENT_FLAG;
774 else
775 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
776
777 return hc;
740} 778}
741 779
742static struct mapped_device *find_device(struct dm_ioctl *param) 780static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
746 784
747 down_read(&_hash_lock); 785 down_read(&_hash_lock);
748 hc = __find_device_hash_cell(param); 786 hc = __find_device_hash_cell(param);
749 if (hc) { 787 if (hc)
750 md = hc->md; 788 md = hc->md;
751
752 /*
753 * Sneakily write in both the name and the uuid
754 * while we have the cell.
755 */
756 strlcpy(param->name, hc->name, sizeof(param->name));
757 if (hc->uuid)
758 strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
759 else
760 param->uuid[0] = '\0';
761
762 if (hc->new_map)
763 param->flags |= DM_INACTIVE_PRESENT_FLAG;
764 else
765 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
766 }
767 up_read(&_hash_lock); 789 up_read(&_hash_lock);
768 790
769 return md; 791 return md;
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1402 goto out; 1424 goto out;
1403 } 1425 }
1404 1426
1427 if (!argc) {
1428 DMWARN("Empty message received.");
1429 goto out;
1430 }
1431
1405 table = dm_get_live_table(md); 1432 table = dm_get_live_table(md);
1406 if (!table) 1433 if (!table)
1407 goto out_argv; 1434 goto out_argv;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 819e37eaaeb..32ac70861d6 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/types.h> 12#include <linux/types.h>
13#include <asm/atomic.h> 13#include <linux/atomic.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/init.h> 16#include <linux/init.h>
@@ -224,8 +224,6 @@ struct kcopyd_job {
224 unsigned int num_dests; 224 unsigned int num_dests;
225 struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; 225 struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
226 226
227 sector_t offset;
228 unsigned int nr_pages;
229 struct page_list *pages; 227 struct page_list *pages;
230 228
231 /* 229 /*
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job)
380 .bi_rw = job->rw, 378 .bi_rw = job->rw,
381 .mem.type = DM_IO_PAGE_LIST, 379 .mem.type = DM_IO_PAGE_LIST,
382 .mem.ptr.pl = job->pages, 380 .mem.ptr.pl = job->pages,
383 .mem.offset = job->offset, 381 .mem.offset = 0,
384 .notify.fn = complete_io, 382 .notify.fn = complete_io,
385 .notify.context = job, 383 .notify.context = job,
386 .client = job->kc->io_client, 384 .client = job->kc->io_client,
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
397static int run_pages_job(struct kcopyd_job *job) 395static int run_pages_job(struct kcopyd_job *job)
398{ 396{
399 int r; 397 int r;
398 unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
400 399
401 job->nr_pages = dm_div_up(job->dests[0].count + job->offset, 400 r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
402 PAGE_SIZE >> 9);
403 r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
404 if (!r) { 401 if (!r) {
405 /* this job is ready for io */ 402 /* this job is ready for io */
406 push(&job->kc->io_jobs, job); 403 push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
602 job->num_dests = num_dests; 599 job->num_dests = num_dests;
603 memcpy(&job->dests, dests, sizeof(*dests) * num_dests); 600 memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
604 601
605 job->offset = 0;
606 job->nr_pages = 0;
607 job->pages = NULL; 602 job->pages = NULL;
608 603
609 job->fn = fn; 604 job->fn = fn;
@@ -622,6 +617,38 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
622} 617}
623EXPORT_SYMBOL(dm_kcopyd_copy); 618EXPORT_SYMBOL(dm_kcopyd_copy);
624 619
620void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
621 dm_kcopyd_notify_fn fn, void *context)
622{
623 struct kcopyd_job *job;
624
625 job = mempool_alloc(kc->job_pool, GFP_NOIO);
626
627 memset(job, 0, sizeof(struct kcopyd_job));
628 job->kc = kc;
629 job->fn = fn;
630 job->context = context;
631 job->master_job = job;
632
633 atomic_inc(&kc->nr_jobs);
634
635 return job;
636}
637EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
638
639void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
640{
641 struct kcopyd_job *job = j;
642 struct dm_kcopyd_client *kc = job->kc;
643
644 job->read_err = read_err;
645 job->write_err = write_err;
646
647 push(&kc->complete_jobs, job);
648 wake(kc);
649}
650EXPORT_SYMBOL(dm_kcopyd_do_callback);
651
625/* 652/*
626 * Cancels a kcopyd job, eg. someone might be deactivating a 653 * Cancels a kcopyd job, eg. someone might be deactivating a
627 * mirror. 654 * mirror.
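
dm_kcopyd_prepare_callback() and dm_kcopyd_do_callback() let a client borrow kcopyd's job machinery purely for completion callbacks: allocate a job that carries no I/O, finish the work by other means, then push the job onto the complete list so the notify function runs in kcopyd context. A hedged usage sketch; struct my_request, my_done() and my_submit() are invented for illustration:

/*
 * Usage sketch for the callback-only kcopyd interface exported above.
 */
#include <linux/completion.h>
#include <linux/dm-kcopyd.h>
#include <linux/errno.h>

struct my_request {                             /* illustrative client state */
        struct completion done;
        int error;
};

static void my_done(int read_err, unsigned long write_err, void *context)
{
        struct my_request *req = context;

        /* Runs in kcopyd context, like any other kcopyd completion. */
        req->error = (read_err || write_err) ? -EIO : 0;
        complete(&req->done);
}

static void my_submit(struct dm_kcopyd_client *kc, struct my_request *req)
{
        void *job;

        init_completion(&req->done);

        job = dm_kcopyd_prepare_callback(kc, my_done, req);

        /* ... do the real work by some other means here ... */

        /* Queue the completion; my_done() will be called by kcopyd. */
        dm_kcopyd_do_callback(job, 0, 0);
}
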
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c374ab..1021c898601 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
394 group[count] = fe->region; 394 group[count] = fe->region;
395 count++; 395 count++;
396 396
397 list_del(&fe->list); 397 list_move(&fe->list, &tmp_list);
398 list_add(&fe->list, &tmp_list);
399 398
400 type = fe->type; 399 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT) 400 if (count >= MAX_FLUSH_GROUP_COUNT)
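
list_move() is simply list_del() followed by list_add() on the same entry, re-parenting it onto the head of the destination list; the hunk above collapses exactly that open-coded pair. A self-contained equivalent, with struct flush_entry standing in for the driver's own structure:

#include <linux/list.h>

struct flush_entry {                            /* stand-in for the real struct */
        struct list_head list;
};

static void move_to_group(struct flush_entry *fe, struct list_head *group)
{
        /* list_move() == list_del() + list_add() onto the head of 'group'. */
        list_move(&fe->list, group);
}
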
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4925b..3b52bb72bd1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
197#define MIRROR_DISK_VERSION 2 197#define MIRROR_DISK_VERSION 2
198#define LOG_OFFSET 2 198#define LOG_OFFSET 2
199 199
200struct log_header { 200struct log_header_disk {
201 uint32_t magic; 201 __le32 magic;
202 202
203 /* 203 /*
204 * Simple, incrementing version. no backward 204 * Simple, incrementing version. no backward
205 * compatibility. 205 * compatibility.
206 */ 206 */
207 __le32 version;
208 __le64 nr_regions;
209} __packed;
210
211struct log_header_core {
212 uint32_t magic;
207 uint32_t version; 213 uint32_t version;
208 sector_t nr_regions; 214 uint64_t nr_regions;
209}; 215};
210 216
211struct log_c { 217struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
239 int log_dev_failed; 245 int log_dev_failed;
240 int log_dev_flush_failed; 246 int log_dev_flush_failed;
241 struct dm_dev *log_dev; 247 struct dm_dev *log_dev;
242 struct log_header header; 248 struct log_header_core header;
243 249
244 struct dm_io_region header_location; 250 struct dm_io_region header_location;
245 struct log_header *disk_header; 251 struct log_header_disk *disk_header;
246}; 252};
247 253
248/* 254/*
@@ -251,34 +257,34 @@ struct log_c {
251 */ 257 */
252static inline int log_test_bit(uint32_t *bs, unsigned bit) 258static inline int log_test_bit(uint32_t *bs, unsigned bit)
253{ 259{
254 return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; 260 return test_bit_le(bit, bs) ? 1 : 0;
255} 261}
256 262
257static inline void log_set_bit(struct log_c *l, 263static inline void log_set_bit(struct log_c *l,
258 uint32_t *bs, unsigned bit) 264 uint32_t *bs, unsigned bit)
259{ 265{
260 __test_and_set_bit_le(bit, (unsigned long *) bs); 266 __set_bit_le(bit, bs);
261 l->touched_cleaned = 1; 267 l->touched_cleaned = 1;
262} 268}
263 269
264static inline void log_clear_bit(struct log_c *l, 270static inline void log_clear_bit(struct log_c *l,
265 uint32_t *bs, unsigned bit) 271 uint32_t *bs, unsigned bit)
266{ 272{
267 __test_and_clear_bit_le(bit, (unsigned long *) bs); 273 __clear_bit_le(bit, bs);
268 l->touched_dirtied = 1; 274 l->touched_dirtied = 1;
269} 275}
270 276
271/*---------------------------------------------------------------- 277/*----------------------------------------------------------------
272 * Header IO 278 * Header IO
273 *--------------------------------------------------------------*/ 279 *--------------------------------------------------------------*/
274static void header_to_disk(struct log_header *core, struct log_header *disk) 280static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
275{ 281{
276 disk->magic = cpu_to_le32(core->magic); 282 disk->magic = cpu_to_le32(core->magic);
277 disk->version = cpu_to_le32(core->version); 283 disk->version = cpu_to_le32(core->version);
278 disk->nr_regions = cpu_to_le64(core->nr_regions); 284 disk->nr_regions = cpu_to_le64(core->nr_regions);
279} 285}
280 286
281static void header_from_disk(struct log_header *core, struct log_header *disk) 287static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
282{ 288{
283 core->magic = le32_to_cpu(disk->magic); 289 core->magic = le32_to_cpu(disk->magic);
284 core->version = le32_to_cpu(disk->version); 290 core->version = le32_to_cpu(disk->version);
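
Splitting the header into log_header_core (native byte order, used in memory) and a packed, explicitly little-endian log_header_disk confines all byte swapping to header_to_disk()/header_from_disk() and fixes nr_regions at 64 bits on disk regardless of the size of sector_t. The same pattern with a made-up two-field header (the my_header_* names are illustrative):

/*
 * On-disk vs. in-core header split with explicit endian conversion.
 */
#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/byteorder.h>

struct my_header_disk {                 /* on disk: fixed width, little endian */
        __le32 magic;
        __le64 nr_regions;
} __packed;

struct my_header_core {                 /* in core: native byte order */
        uint32_t magic;
        uint64_t nr_regions;
};

static void my_header_to_disk(struct my_header_core *core,
                              struct my_header_disk *disk)
{
        disk->magic = cpu_to_le32(core->magic);
        disk->nr_regions = cpu_to_le64(core->nr_regions);
}

static void my_header_from_disk(struct my_header_core *core,
                                struct my_header_disk *disk)
{
        core->magic = le32_to_cpu(disk->magic);
        core->nr_regions = le64_to_cpu(disk->nr_regions);
}
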
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
486 memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); 492 memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
487 lc->sync_count = (sync == NOSYNC) ? region_count : 0; 493 lc->sync_count = (sync == NOSYNC) ? region_count : 0;
488 494
489 lc->recovering_bits = vmalloc(bitset_size); 495 lc->recovering_bits = vzalloc(bitset_size);
490 if (!lc->recovering_bits) { 496 if (!lc->recovering_bits) {
491 DMWARN("couldn't allocate sync bitset"); 497 DMWARN("couldn't allocate sync bitset");
492 vfree(lc->sync_bits); 498 vfree(lc->sync_bits);
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
498 kfree(lc); 504 kfree(lc);
499 return -ENOMEM; 505 return -ENOMEM;
500 } 506 }
501 memset(lc->recovering_bits, 0, bitset_size);
502 lc->sync_search = 0; 507 lc->sync_search = 0;
503 log->context = lc; 508 log->context = lc;
504 509
@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
739 return 0; 744 return 0;
740 745
741 do { 746 do {
742 *region = find_next_zero_bit_le( 747 *region = find_next_zero_bit_le(lc->sync_bits,
743 (unsigned long *) lc->sync_bits,
744 lc->region_count, 748 lc->region_count,
745 lc->sync_search); 749 lc->sync_search);
746 lc->sync_search = *region + 1; 750 lc->sync_search = *region + 1;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa4e570c2cb..5e0090ef418 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -19,10 +19,9 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/workqueue.h> 20#include <linux/workqueue.h>
21#include <scsi/scsi_dh.h> 21#include <scsi/scsi_dh.h>
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000 25#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) 26#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
28 27
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
505 * <#paths> <#per-path selector args> 504 * <#paths> <#per-path selector args>
506 * [<path> [<arg>]* ]+ ]+ 505 * [<path> [<arg>]* ]+ ]+
507 *---------------------------------------------------------------*/ 506 *---------------------------------------------------------------*/
508struct param { 507static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
509 unsigned min;
510 unsigned max;
511 char *error;
512};
513
514static int read_param(struct param *param, char *str, unsigned *v, char **error)
515{
516 if (!str ||
517 (sscanf(str, "%u", v) != 1) ||
518 (*v < param->min) ||
519 (*v > param->max)) {
520 *error = param->error;
521 return -EINVAL;
522 }
523
524 return 0;
525}
526
527struct arg_set {
528 unsigned argc;
529 char **argv;
530};
531
532static char *shift(struct arg_set *as)
533{
534 char *r;
535
536 if (as->argc) {
537 as->argc--;
538 r = *as->argv;
539 as->argv++;
540 return r;
541 }
542
543 return NULL;
544}
545
546static void consume(struct arg_set *as, unsigned n)
547{
548 BUG_ON (as->argc < n);
549 as->argc -= n;
550 as->argv += n;
551}
552
553static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
554 struct dm_target *ti) 508 struct dm_target *ti)
555{ 509{
556 int r; 510 int r;
557 struct path_selector_type *pst; 511 struct path_selector_type *pst;
558 unsigned ps_argc; 512 unsigned ps_argc;
559 513
560 static struct param _params[] = { 514 static struct dm_arg _args[] = {
561 {0, 1024, "invalid number of path selector args"}, 515 {0, 1024, "invalid number of path selector args"},
562 }; 516 };
563 517
564 pst = dm_get_path_selector(shift(as)); 518 pst = dm_get_path_selector(dm_shift_arg(as));
565 if (!pst) { 519 if (!pst) {
566 ti->error = "unknown path selector type"; 520 ti->error = "unknown path selector type";
567 return -EINVAL; 521 return -EINVAL;
568 } 522 }
569 523
570 r = read_param(_params, shift(as), &ps_argc, &ti->error); 524 r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
571 if (r) { 525 if (r) {
572 dm_put_path_selector(pst); 526 dm_put_path_selector(pst);
573 return -EINVAL; 527 return -EINVAL;
574 } 528 }
575 529
576 if (ps_argc > as->argc) {
577 dm_put_path_selector(pst);
578 ti->error = "not enough arguments for path selector";
579 return -EINVAL;
580 }
581
582 r = pst->create(&pg->ps, ps_argc, as->argv); 530 r = pst->create(&pg->ps, ps_argc, as->argv);
583 if (r) { 531 if (r) {
584 dm_put_path_selector(pst); 532 dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
587 } 535 }
588 536
589 pg->ps.type = pst; 537 pg->ps.type = pst;
590 consume(as, ps_argc); 538 dm_consume_args(as, ps_argc);
591 539
592 return 0; 540 return 0;
593} 541}
594 542
595static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, 543static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
596 struct dm_target *ti) 544 struct dm_target *ti)
597{ 545{
598 int r; 546 int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
609 if (!p) 557 if (!p)
610 return ERR_PTR(-ENOMEM); 558 return ERR_PTR(-ENOMEM);
611 559
612 r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), 560 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
613 &p->path.dev); 561 &p->path.dev);
614 if (r) { 562 if (r) {
615 ti->error = "error getting device"; 563 ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
660 return ERR_PTR(r); 608 return ERR_PTR(r);
661} 609}
662 610
663static struct priority_group *parse_priority_group(struct arg_set *as, 611static struct priority_group *parse_priority_group(struct dm_arg_set *as,
664 struct multipath *m) 612 struct multipath *m)
665{ 613{
666 static struct param _params[] = { 614 static struct dm_arg _args[] = {
667 {1, 1024, "invalid number of paths"}, 615 {1, 1024, "invalid number of paths"},
668 {0, 1024, "invalid number of selector args"} 616 {0, 1024, "invalid number of selector args"}
669 }; 617 };
670 618
671 int r; 619 int r;
672 unsigned i, nr_selector_args, nr_params; 620 unsigned i, nr_selector_args, nr_args;
673 struct priority_group *pg; 621 struct priority_group *pg;
674 struct dm_target *ti = m->ti; 622 struct dm_target *ti = m->ti;
675 623
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
693 /* 641 /*
694 * read the paths 642 * read the paths
695 */ 643 */
696 r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); 644 r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
697 if (r) 645 if (r)
698 goto bad; 646 goto bad;
699 647
700 r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); 648 r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
701 if (r) 649 if (r)
702 goto bad; 650 goto bad;
703 651
704 nr_params = 1 + nr_selector_args; 652 nr_args = 1 + nr_selector_args;
705 for (i = 0; i < pg->nr_pgpaths; i++) { 653 for (i = 0; i < pg->nr_pgpaths; i++) {
706 struct pgpath *pgpath; 654 struct pgpath *pgpath;
707 struct arg_set path_args; 655 struct dm_arg_set path_args;
708 656
709 if (as->argc < nr_params) { 657 if (as->argc < nr_args) {
710 ti->error = "not enough path parameters"; 658 ti->error = "not enough path parameters";
711 r = -EINVAL; 659 r = -EINVAL;
712 goto bad; 660 goto bad;
713 } 661 }
714 662
715 path_args.argc = nr_params; 663 path_args.argc = nr_args;
716 path_args.argv = as->argv; 664 path_args.argv = as->argv;
717 665
718 pgpath = parse_path(&path_args, &pg->ps, ti); 666 pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
723 671
724 pgpath->pg = pg; 672 pgpath->pg = pg;
725 list_add_tail(&pgpath->list, &pg->pgpaths); 673 list_add_tail(&pgpath->list, &pg->pgpaths);
726 consume(as, nr_params); 674 dm_consume_args(as, nr_args);
727 } 675 }
728 676
729 return pg; 677 return pg;
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
733 return ERR_PTR(r); 681 return ERR_PTR(r);
734} 682}
735 683
736static int parse_hw_handler(struct arg_set *as, struct multipath *m) 684static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
737{ 685{
738 unsigned hw_argc; 686 unsigned hw_argc;
739 int ret; 687 int ret;
740 struct dm_target *ti = m->ti; 688 struct dm_target *ti = m->ti;
741 689
742 static struct param _params[] = { 690 static struct dm_arg _args[] = {
743 {0, 1024, "invalid number of hardware handler args"}, 691 {0, 1024, "invalid number of hardware handler args"},
744 }; 692 };
745 693
746 if (read_param(_params, shift(as), &hw_argc, &ti->error)) 694 if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
747 return -EINVAL; 695 return -EINVAL;
748 696
749 if (!hw_argc) 697 if (!hw_argc)
750 return 0; 698 return 0;
751 699
752 if (hw_argc > as->argc) { 700 m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
753 ti->error = "not enough arguments for hardware handler";
754 return -EINVAL;
755 }
756
757 m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
758 request_module("scsi_dh_%s", m->hw_handler_name); 701 request_module("scsi_dh_%s", m->hw_handler_name);
759 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { 702 if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
760 ti->error = "unknown hardware handler type"; 703 ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
778 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) 721 for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
779 j = sprintf(p, "%s", as->argv[i]); 722 j = sprintf(p, "%s", as->argv[i]);
780 } 723 }
781 consume(as, hw_argc - 1); 724 dm_consume_args(as, hw_argc - 1);
782 725
783 return 0; 726 return 0;
784fail: 727fail:
@@ -787,20 +730,20 @@ fail:
787 return ret; 730 return ret;
788} 731}
789 732
790static int parse_features(struct arg_set *as, struct multipath *m) 733static int parse_features(struct dm_arg_set *as, struct multipath *m)
791{ 734{
792 int r; 735 int r;
793 unsigned argc; 736 unsigned argc;
794 struct dm_target *ti = m->ti; 737 struct dm_target *ti = m->ti;
795 const char *param_name; 738 const char *arg_name;
796 739
797 static struct param _params[] = { 740 static struct dm_arg _args[] = {
798 {0, 5, "invalid number of feature args"}, 741 {0, 5, "invalid number of feature args"},
799 {1, 50, "pg_init_retries must be between 1 and 50"}, 742 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, 743 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
801 }; 744 };
802 745
803 r = read_param(_params, shift(as), &argc, &ti->error); 746 r = dm_read_arg_group(_args, as, &argc, &ti->error);
804 if (r) 747 if (r)
805 return -EINVAL; 748 return -EINVAL;
806 749
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m)
808 return 0; 751 return 0;
809 752
810 do { 753 do {
811 param_name = shift(as); 754 arg_name = dm_shift_arg(as);
812 argc--; 755 argc--;
813 756
814 if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { 757 if (!strcasecmp(arg_name, "queue_if_no_path")) {
815 r = queue_if_no_path(m, 1, 0); 758 r = queue_if_no_path(m, 1, 0);
816 continue; 759 continue;
817 } 760 }
818 761
819 if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && 762 if (!strcasecmp(arg_name, "pg_init_retries") &&
820 (argc >= 1)) { 763 (argc >= 1)) {
821 r = read_param(_params + 1, shift(as), 764 r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
822 &m->pg_init_retries, &ti->error);
823 argc--; 765 argc--;
824 continue; 766 continue;
825 } 767 }
826 768
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && 769 if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
828 (argc >= 1)) { 770 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as), 771 r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--; 772 argc--;
832 continue; 773 continue;
833 } 774 }
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
842static int multipath_ctr(struct dm_target *ti, unsigned int argc, 783static int multipath_ctr(struct dm_target *ti, unsigned int argc,
843 char **argv) 784 char **argv)
844{ 785{
845 /* target parameters */ 786 /* target arguments */
846 static struct param _params[] = { 787 static struct dm_arg _args[] = {
847 {0, 1024, "invalid number of priority groups"}, 788 {0, 1024, "invalid number of priority groups"},
848 {0, 1024, "invalid initial priority group number"}, 789 {0, 1024, "invalid initial priority group number"},
849 }; 790 };
850 791
851 int r; 792 int r;
852 struct multipath *m; 793 struct multipath *m;
853 struct arg_set as; 794 struct dm_arg_set as;
854 unsigned pg_count = 0; 795 unsigned pg_count = 0;
855 unsigned next_pg_num; 796 unsigned next_pg_num;
856 797
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
871 if (r) 812 if (r)
872 goto bad; 813 goto bad;
873 814
874 r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); 815 r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
875 if (r) 816 if (r)
876 goto bad; 817 goto bad;
877 818
878 r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); 819 r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
879 if (r) 820 if (r)
880 goto bad; 821 goto bad;
881 822
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1505 } 1446 }
1506 1447
1507 if (argc == 1) { 1448 if (argc == 1) {
1508 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { 1449 if (!strcasecmp(argv[0], "queue_if_no_path")) {
1509 r = queue_if_no_path(m, 1, 0); 1450 r = queue_if_no_path(m, 1, 0);
1510 goto out; 1451 goto out;
1511 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { 1452 } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1512 r = queue_if_no_path(m, 0, 0); 1453 r = queue_if_no_path(m, 0, 0);
1513 goto out; 1454 goto out;
1514 } 1455 }
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1519 goto out; 1460 goto out;
1520 } 1461 }
1521 1462
1522 if (!strnicmp(argv[0], MESG_STR("disable_group"))) { 1463 if (!strcasecmp(argv[0], "disable_group")) {
1523 r = bypass_pg_num(m, argv[1], 1); 1464 r = bypass_pg_num(m, argv[1], 1);
1524 goto out; 1465 goto out;
1525 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { 1466 } else if (!strcasecmp(argv[0], "enable_group")) {
1526 r = bypass_pg_num(m, argv[1], 0); 1467 r = bypass_pg_num(m, argv[1], 0);
1527 goto out; 1468 goto out;
1528 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { 1469 } else if (!strcasecmp(argv[0], "switch_group")) {
1529 r = switch_pg_num(m, argv[1]); 1470 r = switch_pg_num(m, argv[1]);
1530 goto out; 1471 goto out;
1531 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1472 } else if (!strcasecmp(argv[0], "reinstate_path"))
1532 action = reinstate_path; 1473 action = reinstate_path;
1533 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1474 else if (!strcasecmp(argv[0], "fail_path"))
1534 action = fail_path; 1475 action = fail_path;
1535 else { 1476 else {
1536 DMWARN("Unrecognised multipath message received."); 1477 DMWARN("Unrecognised multipath message received.");
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index f92b6cea9d9..03a837aa5ce 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -20,7 +20,7 @@
20#include <linux/ctype.h> 20#include <linux/ctype.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <asm/atomic.h> 23#include <linux/atomic.h>
24 24
25#define DM_MSG_PREFIX "multipath queue-length" 25#define DM_MSG_PREFIX "multipath queue-length"
26#define QL_MIN_IO 128 26#define QL_MIN_IO 128
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904fc8f..86df8b2cf92 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,19 +8,19 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10#include "md.h" 10#include "md.h"
11#include "raid1.h"
11#include "raid5.h" 12#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h" 13#include "bitmap.h"
14 14
15#include <linux/device-mapper.h>
16
15#define DM_MSG_PREFIX "raid" 17#define DM_MSG_PREFIX "raid"
16 18
17/* 19/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then 20 * The following flags are used by dm-raid.c to set up the array state.
19 * make it so the flag doesn't set anything. 21 * They must be cleared before md_run is called.
20 */ 22 */
21#ifndef MD_SYNC_STATE_FORCED 23#define FirstUse 10 /* rdev flag */
22#define MD_SYNC_STATE_FORCED 0
23#endif
24 24
25struct raid_dev { 25struct raid_dev {
26 /* 26 /*
@@ -43,14 +43,15 @@ struct raid_dev {
43/* 43/*
44 * Flags for rs->print_flags field. 44 * Flags for rs->print_flags field.
45 */ 45 */
46#define DMPF_DAEMON_SLEEP 0x1 46#define DMPF_SYNC 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2 47#define DMPF_NOSYNC 0x2
48#define DMPF_SYNC 0x4 48#define DMPF_REBUILD 0x4
49#define DMPF_NOSYNC 0x8 49#define DMPF_DAEMON_SLEEP 0x8
50#define DMPF_STRIPE_CACHE 0x10 50#define DMPF_MIN_RECOVERY_RATE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20 51#define DMPF_MAX_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40 52#define DMPF_MAX_WRITE_BEHIND 0x40
53 53#define DMPF_STRIPE_CACHE 0x80
54#define DMPF_REGION_SIZE 0X100
54struct raid_set { 55struct raid_set {
55 struct dm_target *ti; 56 struct dm_target *ti;
56 57
@@ -72,6 +73,7 @@ static struct raid_type {
72 const unsigned level; /* RAID level. */ 73 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */ 74 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = { 75} raid_types[] = {
76 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 77 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 78 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 79 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
105 } 107 }
106 108
107 sectors_per_dev = ti->len; 109 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { 110 if ((raid_type->level > 1) &&
111 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices"; 112 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL); 113 return ERR_PTR(-EINVAL);
111 } 114 }
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs)
147{ 150{
148 int i; 151 int i;
149 152
150 for (i = 0; i < rs->md.raid_disks; i++) 153 for (i = 0; i < rs->md.raid_disks; i++) {
154 if (rs->dev[i].meta_dev)
155 dm_put_device(rs->ti, rs->dev[i].meta_dev);
156 if (rs->dev[i].rdev.sb_page)
157 put_page(rs->dev[i].rdev.sb_page);
158 rs->dev[i].rdev.sb_page = NULL;
159 rs->dev[i].rdev.sb_loaded = 0;
151 if (rs->dev[i].data_dev) 160 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev); 161 dm_put_device(rs->ti, rs->dev[i].data_dev);
162 }
153 163
154 kfree(rs); 164 kfree(rs);
155} 165}
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs)
159 * <meta_dev>: meta device name or '-' if missing 169 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing 170 * <data_dev>: data device name or '-' if missing
161 * 171 *
162 * This code parses those words. 172 * The following are permitted:
173 * - -
174 * - <data_dev>
175 * <meta_dev> <data_dev>
176 *
177 * The following is not allowed:
178 * <meta_dev> -
179 *
180 * This code parses those words. If there is a failure,
181 * the caller must use context_free to unwind the operations.
163 */ 182 */
164static int dev_parms(struct raid_set *rs, char **argv) 183static int dev_parms(struct raid_set *rs, char **argv)
165{ 184{
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
182 rs->dev[i].rdev.mddev = &rs->md; 201 rs->dev[i].rdev.mddev = &rs->md;
183 202
184 if (strcmp(argv[0], "-")) { 203 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported"; 204 ret = dm_get_device(rs->ti, argv[0],
186 return -EINVAL; 205 dm_table_get_mode(rs->ti->table),
206 &rs->dev[i].meta_dev);
207 rs->ti->error = "RAID metadata device lookup failure";
208 if (ret)
209 return ret;
210
211 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
212 if (!rs->dev[i].rdev.sb_page)
213 return -ENOMEM;
187 } 214 }
188 215
189 if (!strcmp(argv[1], "-")) { 216 if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
193 return -EINVAL; 220 return -EINVAL;
194 } 221 }
195 222
223 rs->ti->error = "No data device supplied with metadata device";
224 if (rs->dev[i].meta_dev)
225 return -EINVAL;
226
196 continue; 227 continue;
197 } 228 }
198 229
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
204 return ret; 235 return ret;
205 } 236 }
206 237
238 if (rs->dev[i].meta_dev) {
239 metadata_available = 1;
240 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
241 }
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 242 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 243 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 244 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -235,33 +270,109 @@ static int dev_parms(struct raid_set *rs, char **argv)
235} 270}
236 271
237/* 272/*
273 * validate_region_size
274 * @rs
275 * @region_size: region size in sectors. If 0, pick a size (4MiB default).
276 *
277 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
278 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
279 *
280 * Returns: 0 on success, -EINVAL on failure.
281 */
282static int validate_region_size(struct raid_set *rs, unsigned long region_size)
283{
284 unsigned long min_region_size = rs->ti->len / (1 << 21);
285
286 if (!region_size) {
287 /*
288 * Choose a reasonable default. All figures in sectors.
289 */
290 if (min_region_size > (1 << 13)) {
291 DMINFO("Choosing default region size of %lu sectors",
292 region_size);
293 region_size = min_region_size;
294 } else {
295 DMINFO("Choosing default region size of 4MiB");
296 region_size = 1 << 13; /* sectors */
297 }
298 } else {
299 /*
300 * Validate user-supplied value.
301 */
302 if (region_size > rs->ti->len) {
303 rs->ti->error = "Supplied region size is too large";
304 return -EINVAL;
305 }
306
307 if (region_size < min_region_size) {
308 DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
309 region_size, min_region_size);
310 rs->ti->error = "Supplied region size is too small";
311 return -EINVAL;
312 }
313
314 if (!is_power_of_2(region_size)) {
315 rs->ti->error = "Region size is not a power of 2";
316 return -EINVAL;
317 }
318
319 if (region_size < rs->md.chunk_sectors) {
320 rs->ti->error = "Region size is smaller than the chunk size";
321 return -EINVAL;
322 }
323 }
324
325 /*
326 * Convert sectors to bytes.
327 */
328 rs->md.bitmap_info.chunksize = (region_size << 9);
329
330 return 0;
331}
332
333/*
238 * Possible arguments are... 334 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args] 335 * <chunk_size> [optional_args]
241 * 336 *
242 * Optional args: 337 * Argument definitions
243 * [[no]sync] Force or prevent recovery of the entire array 338 * <chunk_size> The number of sectors per disk that
339 * will form the "stripe"
340 * [[no]sync] Force or prevent recovery of the
341 * entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index 342 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits 343 * [daemon_sleep <ms>] Time between bitmap daemon work to
344 * clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization 345 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization 346 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
347 * [write_mostly <idx>] Indicate a write mostly drive via index
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 348 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 349 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
350 * [region_size <sectors>] Defines granularity of bitmap
250 */ 351 */
251static int parse_raid_params(struct raid_set *rs, char **argv, 352static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params) 353 unsigned num_raid_params)
253{ 354{
254 unsigned i, rebuild_cnt = 0; 355 unsigned i, rebuild_cnt = 0;
255 unsigned long value; 356 unsigned long value, region_size = 0;
256 char *key; 357 char *key;
257 358
258 /* 359 /*
259 * First, parse the in-order required arguments 360 * First, parse the in-order required arguments
361 * "chunk_size" is the only argument of this type.
260 */ 362 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) || 363 if ((strict_strtoul(argv[0], 10, &value) < 0)) {
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size"; 364 rs->ti->error = "Bad chunk size";
264 return -EINVAL; 365 return -EINVAL;
366 } else if (rs->raid_type->level == 1) {
367 if (value)
368 DMERR("Ignoring chunk size parameter for RAID 1");
369 value = 0;
370 } else if (!is_power_of_2(value)) {
371 rs->ti->error = "Chunk size must be a power of 2";
372 return -EINVAL;
373 } else if (value < 8) {
374 rs->ti->error = "Chunk size value is too small";
375 return -EINVAL;
265 } 376 }
266 377
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; 378 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
269 num_raid_params--; 380 num_raid_params--;
270 381
271 /* 382 /*
272 * Second, parse the unordered optional arguments 383 * We set each individual device as In_sync with a completed
384 * 'recovery_offset'. If there has been a device failure or
385 * replacement then one of the following cases applies:
386 *
387 * 1) User specifies 'rebuild'.
388 * - Device is reset when param is read.
389 * 2) A new device is supplied.
390 * - No matching superblock found, resets device.
391 * 3) Device failure was transient and returns on reload.
392 * - Failure noticed, resets device for bitmap replay.
393 * 4) Device hadn't completed recovery after previous failure.
394 * - Superblock is read and overrides recovery_offset.
395 *
396 * What is found in the superblocks of the devices is always
397 * authoritative, unless 'rebuild' or '[no]sync' was specified.
273 */ 398 */
274 for (i = 0; i < rs->md.raid_disks; i++) 399 for (i = 0; i < rs->md.raid_disks; i++) {
275 set_bit(In_sync, &rs->dev[i].rdev.flags); 400 set_bit(In_sync, &rs->dev[i].rdev.flags);
401 rs->dev[i].rdev.recovery_offset = MaxSector;
402 }
276 403
404 /*
405 * Second, parse the unordered optional arguments
406 */
277 for (i = 0; i < num_raid_params; i++) { 407 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) { 408 if (!strcasecmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector; 409 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC; 410 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue; 411 continue;
283 } 412 }
284 if (!strcmp(argv[i], "sync")) { 413 if (!strcasecmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0; 414 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC; 415 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue; 416 continue;
289 } 417 }
290 418
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
300 return -EINVAL; 428 return -EINVAL;
301 } 429 }
302 430
303 if (!strcmp(key, "rebuild")) { 431 if (!strcasecmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) { 432 rebuild_cnt++;
305 rs->ti->error = "Too many rebuild drives given"; 433 if (((rs->raid_type->level != 1) &&
434 (rebuild_cnt > rs->raid_type->parity_devs)) ||
435 ((rs->raid_type->level == 1) &&
436 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
437 rs->ti->error = "Too many rebuild devices specified for given RAID type";
306 return -EINVAL; 438 return -EINVAL;
307 } 439 }
308 if (value > rs->md.raid_disks) { 440 if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
311 } 443 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags); 444 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0; 445 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) { 446 rs->print_flags |= DMPF_REBUILD;
447 } else if (!strcasecmp(key, "write_mostly")) {
448 if (rs->raid_type->level != 1) {
449 rs->ti->error = "write_mostly option is only valid for RAID1";
450 return -EINVAL;
451 }
452 if (value >= rs->md.raid_disks) {
453 rs->ti->error = "Invalid write_mostly drive index given";
454 return -EINVAL;
455 }
456 set_bit(WriteMostly, &rs->dev[value].rdev.flags);
457 } else if (!strcasecmp(key, "max_write_behind")) {
458 if (rs->raid_type->level != 1) {
459 rs->ti->error = "max_write_behind option is only valid for RAID1";
460 return -EINVAL;
461 }
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND; 462 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316 463
317 /* 464 /*
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
324 return -EINVAL; 471 return -EINVAL;
325 } 472 }
326 rs->md.bitmap_info.max_write_behind = value; 473 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) { 474 } else if (!strcasecmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP; 475 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { 476 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range"; 477 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL; 478 return -EINVAL;
332 } 479 }
333 rs->md.bitmap_info.daemon_sleep = value; 480 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) { 481 } else if (!strcasecmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE; 482 rs->print_flags |= DMPF_STRIPE_CACHE;
336 483
337 /* 484 /*
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
348 rs->ti->error = "Bad stripe_cache size"; 495 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL; 496 return -EINVAL;
350 } 497 }
351 } else if (!strcmp(key, "min_recovery_rate")) { 498 } else if (!strcasecmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE; 499 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) { 500 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range"; 501 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL; 502 return -EINVAL;
356 } 503 }
357 rs->md.sync_speed_min = (int)value; 504 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) { 505 } else if (!strcasecmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE; 506 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) { 507 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range"; 508 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL; 509 return -EINVAL;
363 } 510 }
364 rs->md.sync_speed_max = (int)value; 511 rs->md.sync_speed_max = (int)value;
512 } else if (!strcasecmp(key, "region_size")) {
513 rs->print_flags |= DMPF_REGION_SIZE;
514 region_size = value;
365 } else { 515 } else {
366 DMERR("Unable to parse RAID parameter: %s", key); 516 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters"; 517 rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
369 } 519 }
370 } 520 }
371 521
522 if (validate_region_size(rs, region_size))
523 return -EINVAL;
524
525 if (rs->md.chunk_sectors)
526 rs->ti->split_io = rs->md.chunk_sectors;
527 else
528 rs->ti->split_io = region_size;
529
530 if (rs->md.chunk_sectors)
531 rs->ti->split_io = rs->md.chunk_sectors;
532 else
533 rs->ti->split_io = region_size;
534
372 /* Assume there are no metadata devices until the drives are parsed */ 535 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0; 536 rs->md.persistent = 0;
374 rs->md.external = 1; 537 rs->md.external = 1;
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{ 550{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 551 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389 552
553 if (rs->raid_type->level == 1)
554 return md_raid1_congested(&rs->md, bits);
555
390 return md_raid5_congested(&rs->md, bits); 556 return md_raid5_congested(&rs->md, bits);
391} 557}
392 558
393/* 559/*
560 * This structure is never routinely used by userspace, unlike md superblocks.
561 * Devices with this superblock should only ever be accessed via device-mapper.
562 */
563#define DM_RAID_MAGIC 0x64526D44
564struct dm_raid_superblock {
565 __le32 magic; /* "DmRd" */
566 __le32 features; /* Used to indicate possible future changes */
567
568 __le32 num_devices; /* Number of devices in this array. (Max 64) */
569 __le32 array_position; /* The position of this drive in the array */
570
571 __le64 events; /* Incremented by md when superblock updated */
572 __le64 failed_devices; /* Bit field of devices to indicate failures */
573
574 /*
575 * This offset tracks the progress of the repair or replacement of
576 * an individual drive.
577 */
578 __le64 disk_recovery_offset;
579
580 /*
581 * This offset tracks the progress of the initial array
582 * synchronisation/parity calculation.
583 */
584 __le64 array_resync_offset;
585
586 /*
587 * RAID characteristics
588 */
589 __le32 level;
590 __le32 layout;
591 __le32 stripe_sectors;
592
593 __u8 pad[452]; /* Round struct to 512 bytes. */
594 /* Always set to 0 when writing. */
595} __packed;
596
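The block above is a plain 512-byte little-endian structure stored at sector 0 of each metadata device. Purely as an illustration (not part of the patch; the struct and program below are hand-copied assumptions based on the layout above), a userspace tool could inspect such a superblock like this:

        /* hypothetical dump of the dm-raid superblock laid out above */
        #include <stdio.h>
        #include <stdint.h>
        #include <endian.h>

        struct dm_raid_sb {
                uint32_t magic, features;
                uint32_t num_devices, array_position;
                uint64_t events, failed_devices;
                uint64_t disk_recovery_offset, array_resync_offset;
                uint32_t level, layout, stripe_sectors;
                uint8_t  pad[452];              /* rounds the block to 512 bytes */
        };

        int main(int argc, char **argv)
        {
                struct dm_raid_sb sb;
                FILE *f;

                if (argc < 2 || !(f = fopen(argv[1], "rb")) ||
                    fread(&sb, sizeof(sb), 1, f) != 1)
                        return 1;

                if (le32toh(sb.magic) != 0x64526D44) {  /* "DmRd" */
                        fprintf(stderr, "no dm-raid superblock found\n");
                        return 1;
                }
                printf("level %u, %u devices, events %llu, failed 0x%llx\n",
                       le32toh(sb.level), le32toh(sb.num_devices),
                       (unsigned long long)le64toh(sb.events),
                       (unsigned long long)le64toh(sb.failed_devices));
                fclose(f);
                return 0;
        }
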
597static int read_disk_sb(mdk_rdev_t *rdev, int size)
598{
599 BUG_ON(!rdev->sb_page);
600
601 if (rdev->sb_loaded)
602 return 0;
603
604 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
605 DMERR("Failed to read device superblock");
606 return -EINVAL;
607 }
608
609 rdev->sb_loaded = 1;
610
611 return 0;
612}
613
614static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
615{
616 mdk_rdev_t *r, *t;
617 uint64_t failed_devices;
618 struct dm_raid_superblock *sb;
619
620 sb = page_address(rdev->sb_page);
621 failed_devices = le64_to_cpu(sb->failed_devices);
622
623 rdev_for_each(r, t, mddev)
624 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
625 failed_devices |= (1ULL << r->raid_disk);
626
627 memset(sb, 0, sizeof(*sb));
628
629 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
630 sb->features = cpu_to_le32(0); /* No features yet */
631
632 sb->num_devices = cpu_to_le32(mddev->raid_disks);
633 sb->array_position = cpu_to_le32(rdev->raid_disk);
634
635 sb->events = cpu_to_le64(mddev->events);
636 sb->failed_devices = cpu_to_le64(failed_devices);
637
638 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
639 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
640
641 sb->level = cpu_to_le32(mddev->level);
642 sb->layout = cpu_to_le32(mddev->layout);
643 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
644}
645
646/*
647 * super_load
648 *
649 * This function creates a superblock if one is not found on the device
650 * and will decide which superblock to use if there's a choice.
651 *
652 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
653 */
654static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
655{
656 int ret;
657 struct dm_raid_superblock *sb;
658 struct dm_raid_superblock *refsb;
659 uint64_t events_sb, events_refsb;
660
661 rdev->sb_start = 0;
662 rdev->sb_size = sizeof(*sb);
663
664 ret = read_disk_sb(rdev, rdev->sb_size);
665 if (ret)
666 return ret;
667
668 sb = page_address(rdev->sb_page);
669 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
670 super_sync(rdev->mddev, rdev);
671
672 set_bit(FirstUse, &rdev->flags);
673
674 /* Force writing of superblocks to disk */
675 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
676
677 /* Any superblock is better than none, choose that if given */
678 return refdev ? 0 : 1;
679 }
680
681 if (!refdev)
682 return 1;
683
684 events_sb = le64_to_cpu(sb->events);
685
686 refsb = page_address(refdev->sb_page);
687 events_refsb = le64_to_cpu(refsb->events);
688
689 return (events_sb > events_refsb) ? 1 : 0;
690}
691
692static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
693{
694 int role;
695 struct raid_set *rs = container_of(mddev, struct raid_set, md);
696 uint64_t events_sb;
697 uint64_t failed_devices;
698 struct dm_raid_superblock *sb;
699 uint32_t new_devs = 0;
700 uint32_t rebuilds = 0;
701 mdk_rdev_t *r, *t;
702 struct dm_raid_superblock *sb2;
703
704 sb = page_address(rdev->sb_page);
705 events_sb = le64_to_cpu(sb->events);
706 failed_devices = le64_to_cpu(sb->failed_devices);
707
708 /*
709 * Initialise to 1 if this is a new superblock.
710 */
711 mddev->events = events_sb ? : 1;
712
713 /*
714 * Reshaping is not currently allowed
715 */
716 if ((le32_to_cpu(sb->level) != mddev->level) ||
717 (le32_to_cpu(sb->layout) != mddev->layout) ||
718 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
719 DMERR("Reshaping arrays not yet supported.");
720 return -EINVAL;
721 }
722
723 /* We can only change the number of devices in RAID1 right now */
724 if ((rs->raid_type->level != 1) &&
725 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
726 DMERR("Reshaping arrays not yet supported.");
727 return -EINVAL;
728 }
729
730 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
731 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
732
733 /*
734 * During load, we set FirstUse if a new superblock was written.
735 * There are two reasons we might not have a superblock:
736 * 1) The array is brand new - in which case, all of the
737 * devices must have their In_sync bit set. Also,
738 * recovery_cp must be 0, unless forced.
739 * 2) This is a new device being added to an old array
740 * and the new device needs to be rebuilt - in which
741 * case the In_sync bit will /not/ be set and
742 * recovery_cp must be MaxSector.
743 */
744 rdev_for_each(r, t, mddev) {
745 if (!test_bit(In_sync, &r->flags)) {
746 if (!test_bit(FirstUse, &r->flags))
747 DMERR("Superblock area of "
748 "rebuild device %d should have been "
749 "cleared.", r->raid_disk);
750 set_bit(FirstUse, &r->flags);
751 rebuilds++;
752 } else if (test_bit(FirstUse, &r->flags))
753 new_devs++;
754 }
755
756 if (!rebuilds) {
757 if (new_devs == mddev->raid_disks) {
758 DMINFO("Superblocks created for new array");
759 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
760 } else if (new_devs) {
761 DMERR("New device injected "
762 "into existing array without 'rebuild' "
763 "parameter specified");
764 return -EINVAL;
765 }
766 } else if (new_devs) {
767 DMERR("'rebuild' devices cannot be "
768 "injected into an array with other first-time devices");
769 return -EINVAL;
770 } else if (mddev->recovery_cp != MaxSector) {
771 DMERR("'rebuild' specified while array is not in-sync");
772 return -EINVAL;
773 }
774
775 /*
776 * Now we set the Faulty bit for those devices that are
777 * recorded in the superblock as failed.
778 */
779 rdev_for_each(r, t, mddev) {
780 if (!r->sb_page)
781 continue;
782 sb2 = page_address(r->sb_page);
783 sb2->failed_devices = 0;
784
785 /*
786 * Check for any device re-ordering.
787 */
788 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
789 role = le32_to_cpu(sb2->array_position);
790 if (role != r->raid_disk) {
791 if (rs->raid_type->level != 1) {
792 rs->ti->error = "Cannot change device "
793 "positions in RAID array";
794 return -EINVAL;
795 }
796 DMINFO("RAID1 device #%d now at position #%d",
797 role, r->raid_disk);
798 }
799
800 /*
801 * Partial recovery is performed on
802 * returning failed devices.
803 */
804 if (failed_devices & (1 << role))
805 set_bit(Faulty, &r->flags);
806 }
807 }
808
809 return 0;
810}
811
812static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
813{
814 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
815
816 /*
817 * If mddev->events is not set, we know we have not yet initialized
818 * the array.
819 */
820 if (!mddev->events && super_init_validation(mddev, rdev))
821 return -EINVAL;
822
823 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
824 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
825 if (!test_bit(FirstUse, &rdev->flags)) {
826 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
827 if (rdev->recovery_offset != MaxSector)
828 clear_bit(In_sync, &rdev->flags);
829 }
830
831 /*
832 * If a device comes back, set it as not In_sync and no longer faulty.
833 */
834 if (test_bit(Faulty, &rdev->flags)) {
835 clear_bit(Faulty, &rdev->flags);
836 clear_bit(In_sync, &rdev->flags);
837 rdev->saved_raid_disk = rdev->raid_disk;
838 rdev->recovery_offset = 0;
839 }
840
841 clear_bit(FirstUse, &rdev->flags);
842
843 return 0;
844}
845
846/*
847 * Analyse superblocks and select the freshest.
848 */
849static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
850{
851 int ret;
852 mdk_rdev_t *rdev, *freshest, *tmp;
853 mddev_t *mddev = &rs->md;
854
855 freshest = NULL;
856 rdev_for_each(rdev, tmp, mddev) {
857 if (!rdev->meta_bdev)
858 continue;
859
860 ret = super_load(rdev, freshest);
861
862 switch (ret) {
863 case 1:
864 freshest = rdev;
865 break;
866 case 0:
867 break;
868 default:
869 ti->error = "Failed to load superblock";
870 return ret;
871 }
872 }
873
874 if (!freshest)
875 return 0;
876
877 /*
878 * Validation of the freshest device provides the source of
879 * validation for the remaining devices.
880 */
881 ti->error = "Unable to assemble array: Invalid superblocks";
882 if (super_validate(mddev, freshest))
883 return -EINVAL;
884
885 rdev_for_each(rdev, tmp, mddev)
886 if ((rdev != freshest) && super_validate(mddev, rdev))
887 return -EINVAL;
888
889 return 0;
890}
891
892/*
394 * Construct a RAID4/5/6 mapping: 893 * Construct a RAID4/5/6 mapping:
395 * Args: 894 * Args:
396 * <raid_type> <#raid_params> <raid_params> \ 895 * <raid_type> <#raid_params> <raid_params> \
397 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 896 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
398 * 897 *
399 * ** metadata devices are not supported yet, use '-' instead **
400 *
401 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 898 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
402 * details on possible <raid_params>. 899 * details on possible <raid_params>.
403 */ 900 */
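As a purely illustrative example (device paths, sizes and parameter choices are invented, not taken from the patch), a two-leg RAID1 mapping using the new metadata devices, region_size and write_mostly options could be described by a table line of this shape:

        # 1GiB (2097152-sector) raid1; 512KiB regions; second leg write_mostly
        0 2097152 raid raid1 5 0 region_size 1024 write_mostly 1 \
                2 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3

Here '5' counts the raid_params tokens that follow (the leading chunk_size of 0 is ignored for RAID1), '2' is the number of device pairs, and each pair is <metadata_dev> <data_dev>.
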
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
465 if (ret) 962 if (ret)
466 goto bad; 963 goto bad;
467 964
965 rs->md.sync_super = super_sync;
966 ret = analyse_superblocks(ti, rs);
967 if (ret)
968 goto bad;
969
468 INIT_WORK(&rs->md.event_work, do_table_event); 970 INIT_WORK(&rs->md.event_work, do_table_event);
469 ti->split_io = rs->md.chunk_sectors;
470 ti->private = rs; 971 ti->private = rs;
471 972
472 mutex_lock(&rs->md.reconfig_mutex); 973 mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
482 rs->callbacks.congested_fn = raid_is_congested; 983 rs->callbacks.congested_fn = raid_is_congested;
483 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 984 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
484 985
986 mddev_suspend(&rs->md);
485 return 0; 987 return 0;
486 988
487bad: 989bad:
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type,
546 break; 1048 break;
547 case STATUSTYPE_TABLE: 1049 case STATUSTYPE_TABLE:
548 /* The string you would use to construct this array */ 1050 /* The string you would use to construct this array */
549 for (i = 0; i < rs->md.raid_disks; i++) 1051 for (i = 0; i < rs->md.raid_disks; i++) {
550 if (rs->dev[i].data_dev && 1052 if ((rs->print_flags & DMPF_REBUILD) &&
1053 rs->dev[i].data_dev &&
551 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1054 !test_bit(In_sync, &rs->dev[i].rdev.flags))
552 raid_param_cnt++; /* for rebuilds */ 1055 raid_param_cnt += 2; /* for rebuilds */
1056 if (rs->dev[i].data_dev &&
1057 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1058 raid_param_cnt += 2;
1059 }
553 1060
554 raid_param_cnt += (hweight64(rs->print_flags) * 2); 1061 raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
555 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) 1062 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
556 raid_param_cnt--; 1063 raid_param_cnt--;
557 1064
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
565 DMEMIT(" nosync"); 1072 DMEMIT(" nosync");
566 1073
567 for (i = 0; i < rs->md.raid_disks; i++) 1074 for (i = 0; i < rs->md.raid_disks; i++)
568 if (rs->dev[i].data_dev && 1075 if ((rs->print_flags & DMPF_REBUILD) &&
1076 rs->dev[i].data_dev &&
569 !test_bit(In_sync, &rs->dev[i].rdev.flags)) 1077 !test_bit(In_sync, &rs->dev[i].rdev.flags))
570 DMEMIT(" rebuild %u", i); 1078 DMEMIT(" rebuild %u", i);
571 1079
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
579 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) 1087 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
580 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); 1088 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
581 1089
1090 for (i = 0; i < rs->md.raid_disks; i++)
1091 if (rs->dev[i].data_dev &&
1092 test_bit(WriteMostly, &rs->dev[i].rdev.flags))
1093 DMEMIT(" write_mostly %u", i);
1094
582 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) 1095 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
583 DMEMIT(" max_write_behind %lu", 1096 DMEMIT(" max_write_behind %lu",
584 rs->md.bitmap_info.max_write_behind); 1097 rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type,
591 conf ? conf->max_nr_stripes * 2 : 0); 1104 conf ? conf->max_nr_stripes * 2 : 0);
592 } 1105 }
593 1106
1107 if (rs->print_flags & DMPF_REGION_SIZE)
1108 DMEMIT(" region_size %lu",
1109 rs->md.bitmap_info.chunksize >> 9);
1110
594 DMEMIT(" %d", rs->md.raid_disks); 1111 DMEMIT(" %d", rs->md.raid_disks);
595 for (i = 0; i < rs->md.raid_disks; i++) { 1112 for (i = 0; i < rs->md.raid_disks; i++) {
596 DMEMIT(" -"); /* metadata device */ 1113 if (rs->dev[i].meta_dev)
1114 DMEMIT(" %s", rs->dev[i].meta_dev->name);
1115 else
1116 DMEMIT(" -");
597 1117
598 if (rs->dev[i].data_dev) 1118 if (rs->dev[i].data_dev)
599 DMEMIT(" %s", rs->dev[i].data_dev->name); 1119 DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti)
650{ 1170{
651 struct raid_set *rs = ti->private; 1171 struct raid_set *rs = ti->private;
652 1172
1173 bitmap_load(&rs->md);
653 mddev_resume(&rs->md); 1174 mddev_resume(&rs->md);
654} 1175}
655 1176
656static struct target_type raid_target = { 1177static struct target_type raid_target = {
657 .name = "raid", 1178 .name = "raid",
658 .version = {1, 0, 0}, 1179 .version = {1, 1, 0},
659 .module = THIS_MODULE, 1180 .module = THIS_MODULE,
660 .ctr = raid_ctr, 1181 .ctr = raid_ctr,
661 .dtr = raid_dtr, 1182 .dtr = raid_dtr,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1fdbf..d1f1d701710 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
58#define NUM_SNAPSHOT_HDR_CHUNKS 1 58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59 59
60struct disk_header { 60struct disk_header {
61 uint32_t magic; 61 __le32 magic;
62 62
63 /* 63 /*
64 * Is this snapshot valid. There is no way of recovering 64 * Is this snapshot valid. There is no way of recovering
65 * an invalid snapshot. 65 * an invalid snapshot.
66 */ 66 */
67 uint32_t valid; 67 __le32 valid;
68 68
69 /* 69 /*
70 * Simple, incrementing version. no backward 70 * Simple, incrementing version. no backward
71 * compatibility. 71 * compatibility.
72 */ 72 */
73 uint32_t version; 73 __le32 version;
74 74
75 /* In sectors */ 75 /* In sectors */
76 uint32_t chunk_size; 76 __le32 chunk_size;
77}; 77} __packed;
78 78
79struct disk_exception { 79struct disk_exception {
80 __le64 old_chunk;
81 __le64 new_chunk;
82} __packed;
83
84struct core_exception {
80 uint64_t old_chunk; 85 uint64_t old_chunk;
81 uint64_t new_chunk; 86 uint64_t new_chunk;
82}; 87};
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps)
169 if (!ps->area) 174 if (!ps->area)
170 goto err_area; 175 goto err_area;
171 176
172 ps->zero_area = vmalloc(len); 177 ps->zero_area = vzalloc(len);
173 if (!ps->zero_area) 178 if (!ps->zero_area)
174 goto err_zero_area; 179 goto err_zero_area;
175 memset(ps->zero_area, 0, len);
176 180
177 ps->header_area = vmalloc(len); 181 ps->header_area = vmalloc(len);
178 if (!ps->header_area) 182 if (!ps->header_area)
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
396} 400}
397 401
398static void read_exception(struct pstore *ps, 402static void read_exception(struct pstore *ps,
399 uint32_t index, struct disk_exception *result) 403 uint32_t index, struct core_exception *result)
400{ 404{
401 struct disk_exception *e = get_exception(ps, index); 405 struct disk_exception *de = get_exception(ps, index);
402 406
403 /* copy it */ 407 /* copy it */
404 result->old_chunk = le64_to_cpu(e->old_chunk); 408 result->old_chunk = le64_to_cpu(de->old_chunk);
405 result->new_chunk = le64_to_cpu(e->new_chunk); 409 result->new_chunk = le64_to_cpu(de->new_chunk);
406} 410}
407 411
408static void write_exception(struct pstore *ps, 412static void write_exception(struct pstore *ps,
409 uint32_t index, struct disk_exception *de) 413 uint32_t index, struct core_exception *e)
410{ 414{
411 struct disk_exception *e = get_exception(ps, index); 415 struct disk_exception *de = get_exception(ps, index);
412 416
413 /* copy it */ 417 /* copy it */
414 e->old_chunk = cpu_to_le64(de->old_chunk); 418 de->old_chunk = cpu_to_le64(e->old_chunk);
415 e->new_chunk = cpu_to_le64(de->new_chunk); 419 de->new_chunk = cpu_to_le64(e->new_chunk);
416} 420}
417 421
418static void clear_exception(struct pstore *ps, uint32_t index) 422static void clear_exception(struct pstore *ps, uint32_t index)
419{ 423{
420 struct disk_exception *e = get_exception(ps, index); 424 struct disk_exception *de = get_exception(ps, index);
421 425
422 /* clear it */ 426 /* clear it */
423 e->old_chunk = 0; 427 de->old_chunk = 0;
424 e->new_chunk = 0; 428 de->new_chunk = 0;
425} 429}
426 430
427/* 431/*
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps,
437{ 441{
438 int r; 442 int r;
439 unsigned int i; 443 unsigned int i;
440 struct disk_exception de; 444 struct core_exception e;
441 445
442 /* presume the area is full */ 446 /* presume the area is full */
443 *full = 1; 447 *full = 1;
444 448
445 for (i = 0; i < ps->exceptions_per_area; i++) { 449 for (i = 0; i < ps->exceptions_per_area; i++) {
446 read_exception(ps, i, &de); 450 read_exception(ps, i, &e);
447 451
448 /* 452 /*
449 * If the new_chunk is pointing at the start of 453 * If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps,
451 * is we know that we've hit the end of the 455 * is we know that we've hit the end of the
452 * exceptions. Therefore the area is not full. 456 * exceptions. Therefore the area is not full.
453 */ 457 */
454 if (de.new_chunk == 0LL) { 458 if (e.new_chunk == 0LL) {
455 ps->current_committed = i; 459 ps->current_committed = i;
456 *full = 0; 460 *full = 0;
457 break; 461 break;
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps,
460 /* 464 /*
461 * Keep track of the start of the free chunks. 465 * Keep track of the start of the free chunks.
462 */ 466 */
463 if (ps->next_free <= de.new_chunk) 467 if (ps->next_free <= e.new_chunk)
464 ps->next_free = de.new_chunk + 1; 468 ps->next_free = e.new_chunk + 1;
465 469
466 /* 470 /*
467 * Otherwise we add the exception to the snapshot. 471 * Otherwise we add the exception to the snapshot.
468 */ 472 */
469 r = callback(callback_context, de.old_chunk, de.new_chunk); 473 r = callback(callback_context, e.old_chunk, e.new_chunk);
470 if (r) 474 if (r)
471 return r; 475 return r;
472 } 476 }
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
563 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / 567 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
564 sizeof(struct disk_exception); 568 sizeof(struct disk_exception);
565 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 569 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
566 sizeof(*ps->callbacks)); 570 sizeof(*ps->callbacks));
567 if (!ps->callbacks) 571 if (!ps->callbacks)
568 return -ENOMEM; 572 return -ENOMEM;
569 573
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
641{ 645{
642 unsigned int i; 646 unsigned int i;
643 struct pstore *ps = get_info(store); 647 struct pstore *ps = get_info(store);
644 struct disk_exception de; 648 struct core_exception ce;
645 struct commit_callback *cb; 649 struct commit_callback *cb;
646 650
647 de.old_chunk = e->old_chunk; 651 ce.old_chunk = e->old_chunk;
648 de.new_chunk = e->new_chunk; 652 ce.new_chunk = e->new_chunk;
649 write_exception(ps, ps->current_committed++, &de); 653 write_exception(ps, ps->current_committed++, &ce);
650 654
651 /* 655 /*
652 * Add the callback to the back of the array. This code 656 * Add the callback to the back of the array. This code
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
670 * If we completely filled the current area, then wipe the next one. 674 * If we completely filled the current area, then wipe the next one.
671 */ 675 */
672 if ((ps->current_committed == ps->exceptions_per_area) && 676 if ((ps->current_committed == ps->exceptions_per_area) &&
673 zero_disk_area(ps, ps->current_area + 1)) 677 zero_disk_area(ps, ps->current_area + 1))
674 ps->valid = 0; 678 ps->valid = 0;
675 679
676 /* 680 /*
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
701 chunk_t *last_new_chunk) 705 chunk_t *last_new_chunk)
702{ 706{
703 struct pstore *ps = get_info(store); 707 struct pstore *ps = get_info(store);
704 struct disk_exception de; 708 struct core_exception ce;
705 int nr_consecutive; 709 int nr_consecutive;
706 int r; 710 int r;
707 711
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
722 ps->current_committed = ps->exceptions_per_area; 726 ps->current_committed = ps->exceptions_per_area;
723 } 727 }
724 728
725 read_exception(ps, ps->current_committed - 1, &de); 729 read_exception(ps, ps->current_committed - 1, &ce);
726 *last_old_chunk = de.old_chunk; 730 *last_old_chunk = ce.old_chunk;
727 *last_new_chunk = de.new_chunk; 731 *last_new_chunk = ce.new_chunk;
728 732
729 /* 733 /*
730 * Find number of consecutive chunks within the current area, 734 * Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 737 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
734 nr_consecutive++) { 738 nr_consecutive++) {
735 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 739 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
736 &de); 740 &ce);
737 if (de.old_chunk != *last_old_chunk - nr_consecutive || 741 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
738 de.new_chunk != *last_new_chunk - nr_consecutive) 742 ce.new_chunk != *last_new_chunk - nr_consecutive)
739 break; 743 break;
740 } 744 }
741 745
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
753 for (i = 0; i < nr_merged; i++) 757 for (i = 0; i < nr_merged; i++)
754 clear_exception(ps, ps->current_committed - 1 - i); 758 clear_exception(ps, ps->current_committed - 1 - i);
755 759
756 r = area_io(ps, WRITE); 760 r = area_io(ps, WRITE_FLUSH_FUA);
757 if (r < 0) 761 if (r < 0)
758 return r; 762 return r;
759 763
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f3023..6f758870fc1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -30,16 +30,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
30 ((ti)->type->name == dm_snapshot_merge_target_name) 30 ((ti)->type->name == dm_snapshot_merge_target_name)
31 31
32/* 32/*
33 * The percentage increment we will wake up users at
34 */
35#define WAKE_UP_PERCENT 5
36
37/*
38 * kcopyd priority of snapshot operations
39 */
40#define SNAPSHOT_COPY_PRIORITY 2
41
42/*
43 * The size of the mempool used to track chunks in use. 33 * The size of the mempool used to track chunks in use.
44 */ 34 */
45#define MIN_IOS 256 35#define MIN_IOS 256
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception {
180 * kcopyd. 170 * kcopyd.
181 */ 171 */
182 int started; 172 int started;
173
174 /*
175 * For writing a complete chunk, bypassing the copy.
176 */
177 struct bio *full_bio;
178 bio_end_io_t *full_bio_end_io;
179 void *full_bio_private;
183}; 180};
184 181
185/* 182/*
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1055 1052
1056 s = kmalloc(sizeof(*s), GFP_KERNEL); 1053 s = kmalloc(sizeof(*s), GFP_KERNEL);
1057 if (!s) { 1054 if (!s) {
1058 ti->error = "Cannot allocate snapshot context private " 1055 ti->error = "Cannot allocate private snapshot structure";
1059 "structure";
1060 r = -ENOMEM; 1056 r = -ENOMEM;
1061 goto bad; 1057 goto bad;
1062 } 1058 }
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1380 struct dm_snapshot *s = pe->snap; 1376 struct dm_snapshot *s = pe->snap;
1381 struct bio *origin_bios = NULL; 1377 struct bio *origin_bios = NULL;
1382 struct bio *snapshot_bios = NULL; 1378 struct bio *snapshot_bios = NULL;
1379 struct bio *full_bio = NULL;
1383 int error = 0; 1380 int error = 0;
1384 1381
1385 if (!success) { 1382 if (!success) {
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1415 */ 1412 */
1416 dm_insert_exception(&s->complete, e); 1413 dm_insert_exception(&s->complete, e);
1417 1414
1418 out: 1415out:
1419 dm_remove_exception(&pe->e); 1416 dm_remove_exception(&pe->e);
1420 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1417 snapshot_bios = bio_list_get(&pe->snapshot_bios);
1421 origin_bios = bio_list_get(&pe->origin_bios); 1418 origin_bios = bio_list_get(&pe->origin_bios);
1419 full_bio = pe->full_bio;
1420 if (full_bio) {
1421 full_bio->bi_end_io = pe->full_bio_end_io;
1422 full_bio->bi_private = pe->full_bio_private;
1423 }
1422 free_pending_exception(pe); 1424 free_pending_exception(pe);
1423 1425
1424 increment_pending_exceptions_done_count(); 1426 increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1426 up_write(&s->lock); 1428 up_write(&s->lock);
1427 1429
1428 /* Submit any pending write bios */ 1430 /* Submit any pending write bios */
1429 if (error) 1431 if (error) {
1432 if (full_bio)
1433 bio_io_error(full_bio);
1430 error_bios(snapshot_bios); 1434 error_bios(snapshot_bios);
1431 else 1435 } else {
1436 if (full_bio)
1437 bio_endio(full_bio, 0);
1432 flush_bios(snapshot_bios); 1438 flush_bios(snapshot_bios);
1439 }
1433 1440
1434 retry_origin_bios(s, origin_bios); 1441 retry_origin_bios(s, origin_bios);
1435} 1442}
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe)
1480 dest.count = src.count; 1487 dest.count = src.count;
1481 1488
1482 /* Hand over to kcopyd */ 1489 /* Hand over to kcopyd */
1483 dm_kcopyd_copy(s->kcopyd_client, 1490 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1484 &src, 1, &dest, 0, copy_callback, pe); 1491}
1492
1493static void full_bio_end_io(struct bio *bio, int error)
1494{
1495 void *callback_data = bio->bi_private;
1496
1497 dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
1498}
1499
1500static void start_full_bio(struct dm_snap_pending_exception *pe,
1501 struct bio *bio)
1502{
1503 struct dm_snapshot *s = pe->snap;
1504 void *callback_data;
1505
1506 pe->full_bio = bio;
1507 pe->full_bio_end_io = bio->bi_end_io;
1508 pe->full_bio_private = bio->bi_private;
1509
1510 callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1511 copy_callback, pe);
1512
1513 bio->bi_end_io = full_bio_end_io;
1514 bio->bi_private = callback_data;
1515
1516 generic_make_request(bio);
1485} 1517}
1486 1518
1487static struct dm_snap_pending_exception * 1519static struct dm_snap_pending_exception *
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
1519 bio_list_init(&pe->origin_bios); 1551 bio_list_init(&pe->origin_bios);
1520 bio_list_init(&pe->snapshot_bios); 1552 bio_list_init(&pe->snapshot_bios);
1521 pe->started = 0; 1553 pe->started = 0;
1554 pe->full_bio = NULL;
1522 1555
1523 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1556 if (s->store->type->prepare_exception(s->store, &pe->e)) {
1524 free_pending_exception(pe); 1557 free_pending_exception(pe);
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1612 } 1645 }
1613 1646
1614 remap_exception(s, &pe->e, bio, chunk); 1647 remap_exception(s, &pe->e, bio, chunk);
1615 bio_list_add(&pe->snapshot_bios, bio);
1616 1648
1617 r = DM_MAPIO_SUBMITTED; 1649 r = DM_MAPIO_SUBMITTED;
1618 1650
1651 if (!pe->started &&
1652 bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
1653 pe->started = 1;
1654 up_write(&s->lock);
1655 start_full_bio(pe, bio);
1656 goto out;
1657 }
1658
1659 bio_list_add(&pe->snapshot_bios, bio);
1660
1619 if (!pe->started) { 1661 if (!pe->started) {
1620 /* this is protected by snap->lock */ 1662 /* this is protected by snap->lock */
1621 pe->started = 1; 1663 pe->started = 1;
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1628 map_context->ptr = track_chunk(s, chunk); 1670 map_context->ptr = track_chunk(s, chunk);
1629 } 1671 }
1630 1672
1631 out_unlock: 1673out_unlock:
1632 up_write(&s->lock); 1674 up_write(&s->lock);
1633 out: 1675out:
1634 return r; 1676 return r;
1635} 1677}
1636 1678
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
1974 pe_to_start_now = pe; 2016 pe_to_start_now = pe;
1975 } 2017 }
1976 2018
1977 next_snapshot: 2019next_snapshot:
1978 up_write(&snap->lock); 2020 up_write(&snap->lock);
1979 2021
1980 if (pe_to_start_now) { 2022 if (pe_to_start_now) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 451c3bb176d..bc04518e9d8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -17,7 +17,7 @@
17#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <asm/atomic.h> 20#include <linux/atomic.h>
21 21
22#define DM_MSG_PREFIX "table" 22#define DM_MSG_PREFIX "table"
23 23
@@ -54,7 +54,6 @@ struct dm_table {
54 sector_t *highs; 54 sector_t *highs;
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 unsigned discards_supported:1;
58 unsigned integrity_supported:1; 57 unsigned integrity_supported:1;
59 58
60 /* 59 /*
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
154 return NULL; 153 return NULL;
155 154
156 size = nmemb * elem_size; 155 size = nmemb * elem_size;
157 addr = vmalloc(size); 156 addr = vzalloc(size);
158 if (addr)
159 memset(addr, 0, size);
160 157
161 return addr; 158 return addr;
162} 159}
160EXPORT_SYMBOL(dm_vcalloc);
163 161
164/* 162/*
165 * highs, and targets are managed as dynamic arrays during a 163 * highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
209 INIT_LIST_HEAD(&t->devices); 207 INIT_LIST_HEAD(&t->devices);
210 INIT_LIST_HEAD(&t->target_callbacks); 208 INIT_LIST_HEAD(&t->target_callbacks);
211 atomic_set(&t->holders, 0); 209 atomic_set(&t->holders, 0);
212 t->discards_supported = 1;
213 210
214 if (!num_targets) 211 if (!num_targets)
215 num_targets = KEYS_PER_NODE; 212 num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t)
281{ 278{
282 atomic_inc(&t->holders); 279 atomic_inc(&t->holders);
283} 280}
281EXPORT_SYMBOL(dm_table_get);
284 282
285void dm_table_put(struct dm_table *t) 283void dm_table_put(struct dm_table *t)
286{ 284{
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t)
290 smp_mb__before_atomic_dec(); 288 smp_mb__before_atomic_dec();
291 atomic_dec(&t->holders); 289 atomic_dec(&t->holders);
292} 290}
291EXPORT_SYMBOL(dm_table_put);
293 292
294/* 293/*
295 * Checks to see if we need to extend highs or targets. 294 * Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
455 * Add a device to the list, or just increment the usage count if 454 * Add a device to the list, or just increment the usage count if
456 * it's already present. 455 * it's already present.
457 */ 456 */
458static int __table_get_device(struct dm_table *t, struct dm_target *ti, 457int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
459 const char *path, fmode_t mode, struct dm_dev **result) 458 struct dm_dev **result)
460{ 459{
461 int r; 460 int r;
462 dev_t uninitialized_var(dev); 461 dev_t uninitialized_var(dev);
463 struct dm_dev_internal *dd; 462 struct dm_dev_internal *dd;
464 unsigned int major, minor; 463 unsigned int major, minor;
464 struct dm_table *t = ti->table;
465 465
466 BUG_ON(!t); 466 BUG_ON(!t);
467 467
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
509 *result = &dd->dm_dev; 509 *result = &dd->dm_dev;
510 return 0; 510 return 0;
511} 511}
512EXPORT_SYMBOL(dm_get_device);
512 513
513int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 514int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
514 sector_t start, sector_t len, void *data) 515 sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
539 * If not we'll force DM to use PAGE_SIZE or 540 * If not we'll force DM to use PAGE_SIZE or
540 * smaller I/O, just to be safe. 541 * smaller I/O, just to be safe.
541 */ 542 */
542 543 if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
543 if (q->merge_bvec_fn && !ti->type->merge)
544 blk_limits_max_hw_sectors(limits, 544 blk_limits_max_hw_sectors(limits,
545 (unsigned int) (PAGE_SIZE >> 9)); 545 (unsigned int) (PAGE_SIZE >> 9));
546 return 0; 546 return 0;
547} 547}
548EXPORT_SYMBOL_GPL(dm_set_device_limits); 548EXPORT_SYMBOL_GPL(dm_set_device_limits);
549 549
550int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
551 struct dm_dev **result)
552{
553 return __table_get_device(ti->table, ti, path, mode, result);
554}
555
556
557/* 550/*
558 * Decrement a devices use count and remove it if necessary. 551 * Decrement a device's use count and remove it if necessary.
559 */ 552 */
560void dm_put_device(struct dm_target *ti, struct dm_dev *d) 553void dm_put_device(struct dm_target *ti, struct dm_dev *d)
561{ 554{
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
568 kfree(dd); 561 kfree(dd);
569 } 562 }
570} 563}
564EXPORT_SYMBOL(dm_put_device);
571 565
572/* 566/*
573 * Checks to see if the target joins onto the end of the table. 567 * Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
791 785
792 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 786 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
793 787
794 if (!tgt->num_discard_requests) 788 if (!tgt->num_discard_requests && tgt->discards_supported)
795 t->discards_supported = 0; 789 DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
790 dm_device_name(t->md), type);
796 791
797 return 0; 792 return 0;
798 793
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
802 return r; 797 return r;
803} 798}
804 799
800/*
801 * Target argument parsing helpers.
802 */
803static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
804 unsigned *value, char **error, unsigned grouped)
805{
806 const char *arg_str = dm_shift_arg(arg_set);
807
808 if (!arg_str ||
809 (sscanf(arg_str, "%u", value) != 1) ||
810 (*value < arg->min) ||
811 (*value > arg->max) ||
812 (grouped && arg_set->argc < *value)) {
813 *error = arg->error;
814 return -EINVAL;
815 }
816
817 return 0;
818}
819
820int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
821 unsigned *value, char **error)
822{
823 return validate_next_arg(arg, arg_set, value, error, 0);
824}
825EXPORT_SYMBOL(dm_read_arg);
826
827int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
828 unsigned *value, char **error)
829{
830 return validate_next_arg(arg, arg_set, value, error, 1);
831}
832EXPORT_SYMBOL(dm_read_arg_group);
833
834const char *dm_shift_arg(struct dm_arg_set *as)
835{
836 char *r;
837
838 if (as->argc) {
839 as->argc--;
840 r = *as->argv;
841 as->argv++;
842 return r;
843 }
844
845 return NULL;
846}
847EXPORT_SYMBOL(dm_shift_arg);
848
849void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
850{
851 BUG_ON(as->argc < num_args);
852 as->argc -= num_args;
853 as->argv += num_args;
854}
855EXPORT_SYMBOL(dm_consume_args);
856
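A minimal usage sketch for these helpers (the 'demo' function and the 'verbose' feature name are hypothetical; only dm_read_arg_group(), dm_shift_arg() and the struct dm_arg fields come from the code above):

        static int demo_parse_features(struct dm_arg_set *as, struct dm_target *ti)
        {
                static struct dm_arg _args[] = {
                        {0, 2, "Invalid number of feature arguments"},
                };
                const char *arg_name;
                unsigned argc;
                int r;

                /* Read "<#features>" and check that enough arguments follow it. */
                r = dm_read_arg_group(_args, as, &argc, &ti->error);
                if (r)
                        return r;

                while (argc--) {
                        arg_name = dm_shift_arg(as);
                        if (!strcasecmp(arg_name, "verbose"))
                                continue;       /* a feature flag would be set here */

                        ti->error = "Unrecognised feature requested";
                        return -EINVAL;
                }

                return 0;
        }
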
805static int dm_table_set_type(struct dm_table *t) 857static int dm_table_set_type(struct dm_table *t)
806{ 858{
807 unsigned i; 859 unsigned i;
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t)
1077 t->event_fn(t->event_context); 1129 t->event_fn(t->event_context);
1078 mutex_unlock(&_event_lock); 1130 mutex_unlock(&_event_lock);
1079} 1131}
1132EXPORT_SYMBOL(dm_table_event);
1080 1133
1081sector_t dm_table_get_size(struct dm_table *t) 1134sector_t dm_table_get_size(struct dm_table *t)
1082{ 1135{
1083 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1136 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1084} 1137}
1138EXPORT_SYMBOL(dm_table_get_size);
1085 1139
1086struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 1140struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1087{ 1141{
@@ -1184,19 +1238,72 @@ static void dm_table_set_integrity(struct dm_table *t)
1184 return; 1238 return;
1185 1239
1186 template_disk = dm_table_get_integrity_disk(t, true); 1240 template_disk = dm_table_get_integrity_disk(t, true);
1187 if (!template_disk && 1241 if (template_disk)
1188 blk_integrity_is_initialized(dm_disk(t->md))) { 1242 blk_integrity_register(dm_disk(t->md),
1243 blk_get_integrity(template_disk));
1244 else if (blk_integrity_is_initialized(dm_disk(t->md)))
1189 DMWARN("%s: device no longer has a valid integrity profile", 1245 DMWARN("%s: device no longer has a valid integrity profile",
1190 dm_device_name(t->md)); 1246 dm_device_name(t->md));
1191 return; 1247 else
1248 DMWARN("%s: unable to establish an integrity profile",
1249 dm_device_name(t->md));
1250}
1251
1252static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1253 sector_t start, sector_t len, void *data)
1254{
1255 unsigned flush = (*(unsigned *)data);
1256 struct request_queue *q = bdev_get_queue(dev->bdev);
1257
1258 return q && (q->flush_flags & flush);
1259}
1260
1261static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1262{
1263 struct dm_target *ti;
1264 unsigned i = 0;
1265
1266 /*
1267 * Require at least one underlying device to support flushes.
1268 * t->devices includes internal dm devices such as mirror logs
1269 * so we need to use iterate_devices here, which targets
1270 * supporting flushes must provide.
1271 */
1272 while (i < dm_table_get_num_targets(t)) {
1273 ti = dm_table_get_target(t, i++);
1274
1275 if (!ti->num_flush_requests)
1276 continue;
1277
1278 if (ti->type->iterate_devices &&
1279 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1280 return 1;
1281 }
1282
1283 return 0;
1284}
1285
1286static bool dm_table_discard_zeroes_data(struct dm_table *t)
1287{
1288 struct dm_target *ti;
1289 unsigned i = 0;
1290
 1291 /* Ensure that all targets support discard_zeroes_data. */
1292 while (i < dm_table_get_num_targets(t)) {
1293 ti = dm_table_get_target(t, i++);
1294
1295 if (ti->discard_zeroes_data_unsupported)
1296 return 0;
1192 } 1297 }
1193 blk_integrity_register(dm_disk(t->md), 1298
1194 blk_get_integrity(template_disk)); 1299 return 1;
1195} 1300}
1196 1301
1197void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1302void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1198 struct queue_limits *limits) 1303 struct queue_limits *limits)
1199{ 1304{
1305 unsigned flush = 0;
1306
1200 /* 1307 /*
1201 * Copy table's limits to the DM device's request_queue 1308 * Copy table's limits to the DM device's request_queue
1202 */ 1309 */
@@ -1207,6 +1314,16 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1207 else 1314 else
1208 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1315 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1209 1316
1317 if (dm_table_supports_flush(t, REQ_FLUSH)) {
1318 flush |= REQ_FLUSH;
1319 if (dm_table_supports_flush(t, REQ_FUA))
1320 flush |= REQ_FUA;
1321 }
1322 blk_queue_flush(q, flush);
1323
1324 if (!dm_table_discard_zeroes_data(t))
1325 q->limits.discard_zeroes_data = 0;
1326
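For context, a target opts in to the two checks used above through per-target fields: ti->num_flush_requests is what dm_table_supports_flush() tests, and ti->discard_zeroes_data_unsupported is what dm_table_discard_zeroes_data() tests. A sketch (the constructor name is invented) of a target setting both:

        static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
        {
                /* Receive empty flush requests from dm core. */
                ti->num_flush_requests = 1;

                /* Force discard_zeroes_data off for any table that includes us. */
                ti->discard_zeroes_data_unsupported = 1;

                return 0;
        }
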
1210 dm_table_set_integrity(t); 1327 dm_table_set_integrity(t);
1211 1328
1212 /* 1329 /*
@@ -1237,6 +1354,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
1237{ 1354{
1238 return t->mode; 1355 return t->mode;
1239} 1356}
1357EXPORT_SYMBOL(dm_table_get_mode);
1240 1358
1241static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1359static void suspend_targets(struct dm_table *t, unsigned postsuspend)
1242{ 1360{
@@ -1345,6 +1463,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
1345{ 1463{
1346 return t->md; 1464 return t->md;
1347} 1465}
1466EXPORT_SYMBOL(dm_table_get_md);
1348 1467
1349static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1468static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1350 sector_t start, sector_t len, void *data) 1469 sector_t start, sector_t len, void *data)
@@ -1359,19 +1478,19 @@ bool dm_table_supports_discards(struct dm_table *t)
1359 struct dm_target *ti; 1478 struct dm_target *ti;
1360 unsigned i = 0; 1479 unsigned i = 0;
1361 1480
1362 if (!t->discards_supported)
1363 return 0;
1364
1365 /* 1481 /*
1366 * Unless any target used by the table set discards_supported, 1482 * Unless any target used by the table set discards_supported,
1367 * require at least one underlying device to support discards. 1483 * require at least one underlying device to support discards.
1368 * t->devices includes internal dm devices such as mirror logs 1484 * t->devices includes internal dm devices such as mirror logs
1369 * so we need to use iterate_devices here, which targets 1485 * so we need to use iterate_devices here, which targets
1370 * supporting discard must provide. 1486 * supporting discard selectively must provide.
1371 */ 1487 */
1372 while (i < dm_table_get_num_targets(t)) { 1488 while (i < dm_table_get_num_targets(t)) {
1373 ti = dm_table_get_target(t, i++); 1489 ti = dm_table_get_target(t, i++);
1374 1490
1491 if (!ti->num_discard_requests)
1492 continue;
1493
1375 if (ti->discards_supported) 1494 if (ti->discards_supported)
1376 return 1; 1495 return 1;
1377 1496
@@ -1382,13 +1501,3 @@ bool dm_table_supports_discards(struct dm_table *t)
1382 1501
1383 return 0; 1502 return 0;
1384} 1503}
1385
1386EXPORT_SYMBOL(dm_vcalloc);
1387EXPORT_SYMBOL(dm_get_device);
1388EXPORT_SYMBOL(dm_put_device);
1389EXPORT_SYMBOL(dm_table_event);
1390EXPORT_SYMBOL(dm_table_get_size);
1391EXPORT_SYMBOL(dm_table_get_mode);
1392EXPORT_SYMBOL(dm_table_get_md);
1393EXPORT_SYMBOL(dm_table_put);
1394EXPORT_SYMBOL(dm_table_get);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b47887..52b39f335bb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
37static unsigned int major = 0; 37static unsigned int major = 0;
38static unsigned int _major = 0; 38static unsigned int _major = 0;
39 39
40static DEFINE_IDR(_minor_idr);
41
40static DEFINE_SPINLOCK(_minor_lock); 42static DEFINE_SPINLOCK(_minor_lock);
41/* 43/*
42 * For bio-based dm. 44 * For bio-based dm.
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
109#define DMF_FREEING 3 111#define DMF_FREEING 3
110#define DMF_DELETING 4 112#define DMF_DELETING 4
111#define DMF_NOFLUSH_SUSPENDING 5 113#define DMF_NOFLUSH_SUSPENDING 5
114#define DMF_MERGE_IS_OPTIONAL 6
112 115
113/* 116/*
114 * Work processed by per-device workqueue. 117 * Work processed by per-device workqueue.
@@ -313,6 +316,12 @@ static void __exit dm_exit(void)
313 316
314 while (i--) 317 while (i--)
315 _exits[i](); 318 _exits[i]();
319
320 /*
321 * Should be empty by this point.
322 */
323 idr_remove_all(&_minor_idr);
324 idr_destroy(&_minor_idr);
316} 325}
317 326
318/* 327/*
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
1171 1180
1172 /* 1181 /*
1173 * Even though the device advertised discard support, 1182 * Even though the device advertised discard support,
1174 * reconfiguration might have changed that since the 1183 * that does not mean every target supports it, and
1184 * reconfiguration might also have changed that since the
1175 * check was performed. 1185 * check was performed.
1176 */ 1186 */
1177 if (!ti->num_discard_requests) 1187 if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1705/*----------------------------------------------------------------- 1715/*-----------------------------------------------------------------
1706 * An IDR is used to keep track of allocated minor numbers. 1716 * An IDR is used to keep track of allocated minor numbers.
1707 *---------------------------------------------------------------*/ 1717 *---------------------------------------------------------------*/
1708static DEFINE_IDR(_minor_idr);
1709
1710static void free_minor(int minor) 1718static void free_minor(int minor)
1711{ 1719{
1712 spin_lock(&_minor_lock); 1720 spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
1800 blk_queue_make_request(md->queue, dm_request); 1808 blk_queue_make_request(md->queue, dm_request);
1801 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1802 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1803 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1804} 1811}
1805 1812
1806/* 1813/*
@@ -1986,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
1986} 1993}
1987 1994
1988/* 1995/*
1996 * Return 1 if the queue has a compulsory merge_bvec_fn function.
1997 *
1998 * If this function returns 0, then the device is either a non-dm
1999 * device without a merge_bvec_fn, or it is a dm device that is
2000 * able to split any bios it receives that are too big.
2001 */
2002int dm_queue_merge_is_compulsory(struct request_queue *q)
2003{
2004 struct mapped_device *dev_md;
2005
2006 if (!q->merge_bvec_fn)
2007 return 0;
2008
2009 if (q->make_request_fn == dm_request) {
2010 dev_md = q->queuedata;
2011 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2012 return 0;
2013 }
2014
2015 return 1;
2016}
2017
2018static int dm_device_merge_is_compulsory(struct dm_target *ti,
2019 struct dm_dev *dev, sector_t start,
2020 sector_t len, void *data)
2021{
2022 struct block_device *bdev = dev->bdev;
2023 struct request_queue *q = bdev_get_queue(bdev);
2024
2025 return dm_queue_merge_is_compulsory(q);
2026}
2027
2028/*
2029 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2030 * on the properties of the underlying devices.
2031 */
2032static int dm_table_merge_is_optional(struct dm_table *table)
2033{
2034 unsigned i = 0;
2035 struct dm_target *ti;
2036
2037 while (i < dm_table_get_num_targets(table)) {
2038 ti = dm_table_get_target(table, i++);
2039
2040 if (ti->type->iterate_devices &&
2041 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2042 return 0;
2043 }
2044
2045 return 1;
2046}
2047
2048/*
1989 * Returns old map, which caller must destroy. 2049 * Returns old map, which caller must destroy.
1990 */ 2050 */
1991static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2051static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1995 struct request_queue *q = md->queue; 2055 struct request_queue *q = md->queue;
1996 sector_t size; 2056 sector_t size;
1997 unsigned long flags; 2057 unsigned long flags;
2058 int merge_is_optional;
1998 2059
1999 size = dm_table_get_size(t); 2060 size = dm_table_get_size(t);
2000 2061
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2020 2081
2021 __bind_mempools(md, t); 2082 __bind_mempools(md, t);
2022 2083
2084 merge_is_optional = dm_table_merge_is_optional(t);
2085
2023 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2024 old_map = md->map; 2087 old_map = md->map;
2025 md->map = t; 2088 md->map = t;
2026 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2090 if (merge_is_optional)
2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2092 else
2093 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2027 write_unlock_irqrestore(&md->map_lock, flags); 2094 write_unlock_irqrestore(&md->map_lock, flags);
2028 2095
2029 return old_map; 2096 return old_map;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf16746da..6745dbd278a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
66void dm_table_free_md_mempools(struct dm_table *t); 66void dm_table_free_md_mempools(struct dm_table *t);
67struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); 67struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
68 68
69int dm_queue_merge_is_compulsory(struct request_queue *q);
70
69void dm_lock_md_type(struct mapped_device *md); 71void dm_lock_md_type(struct mapped_device *md);
70void dm_unlock_md_type(struct mapped_device *md); 72void dm_unlock_md_type(struct mapped_device *md);
71void dm_set_md_type(struct mapped_device *md, unsigned type); 73void dm_set_md_type(struct mapped_device *md, unsigned type);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index abfb59a61ed..6cd2c313e80 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -213,12 +213,6 @@ static int linear_run (mddev_t *mddev)
213 return md_integrity_register(mddev); 213 return md_integrity_register(mddev);
214} 214}
215 215
216static void free_conf(struct rcu_head *head)
217{
218 linear_conf_t *conf = container_of(head, linear_conf_t, rcu);
219 kfree(conf);
220}
221
222static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) 216static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
223{ 217{
224 /* Adding a drive to a linear array allows the array to grow. 218 /* Adding a drive to a linear array allows the array to grow.
@@ -247,7 +241,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
247 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 241 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
248 set_capacity(mddev->gendisk, mddev->array_sectors); 242 set_capacity(mddev->gendisk, mddev->array_sectors);
249 revalidate_disk(mddev->gendisk); 243 revalidate_disk(mddev->gendisk);
250 call_rcu(&oldconf->rcu, free_conf); 244 kfree_rcu(oldconf, rcu);
251 return 0; 245 return 0;
252} 246}
253 247
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 0ce29b61605..2f2da05b2ce 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t;
10 10
11struct linear_private_data 11struct linear_private_data
12{ 12{
13 struct rcu_head rcu;
13 sector_t array_sectors; 14 sector_t array_sectors;
14 dev_info_t disks[0]; 15 dev_info_t disks[0];
15 struct rcu_head rcu;
16}; 16};
17 17
18 18
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 91e31e260b4..5c95ccb5950 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -61,6 +61,11 @@
61static void autostart_arrays(int part); 61static void autostart_arrays(int part);
62#endif 62#endif
63 63
64/* pers_list is a list of registered personalities protected
65 * by pers_lock.
66 * pers_lock does extra service to protect accesses to
67 * mddev->thread when the mutex cannot be held.
68 */
64static LIST_HEAD(pers_list); 69static LIST_HEAD(pers_list);
65static DEFINE_SPINLOCK(pers_lock); 70static DEFINE_SPINLOCK(pers_lock);
66 71
@@ -215,6 +220,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
215} 220}
216EXPORT_SYMBOL_GPL(bio_clone_mddev); 221EXPORT_SYMBOL_GPL(bio_clone_mddev);
217 222
223void md_trim_bio(struct bio *bio, int offset, int size)
224{
225 /* 'bio' is a cloned bio which we need to trim to match
226 * the given offset and size.
227 * This requires adjusting bi_sector, bi_size, and bi_io_vec
228 */
229 int i;
230 struct bio_vec *bvec;
231 int sofar = 0;
232
233 size <<= 9;
234 if (offset == 0 && size == bio->bi_size)
235 return;
236
237 bio->bi_sector += offset;
238 bio->bi_size = size;
239 offset <<= 9;
240 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
241
242 while (bio->bi_idx < bio->bi_vcnt &&
243 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
244 /* remove this whole bio_vec */
245 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
246 bio->bi_idx++;
247 }
248 if (bio->bi_idx < bio->bi_vcnt) {
249 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
250 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
251 }
252 /* avoid any complications with bi_idx being non-zero*/
253 if (bio->bi_idx) {
254 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
255 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
256 bio->bi_vcnt -= bio->bi_idx;
257 bio->bi_idx = 0;
258 }
259 /* Make sure vcnt and last bv are not too big */
260 bio_for_each_segment(bvec, bio, i) {
261 if (sofar + bvec->bv_len > size)
262 bvec->bv_len = size - sofar;
263 if (bvec->bv_len == 0) {
264 bio->bi_vcnt = i;
265 break;
266 }
267 sofar += bvec->bv_len;
268 }
269}
270EXPORT_SYMBOL_GPL(md_trim_bio);
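A minimal sketch of how a caller might use the new helper on a cloned bio (r1_bio, max_sectors and rdev are placeholder names, not taken from this hunk): clone the original request, trim it to the sub-range that is still wanted, then point it at the member device.

	struct bio *b = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
	/* keep only max_sectors starting at r1_bio->sector */
	md_trim_bio(b, r1_bio->sector - b->bi_sector, max_sectors);
	b->bi_sector = r1_bio->sector + rdev->data_offset;
	b->bi_bdev = rdev->bdev;
	generic_make_request(b);

md_trim_bio() already advances bi_sector by the trimmed offset; the explicit assignment afterwards simply retargets the bio at the member device's data area.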
271
218/* 272/*
219 * We have a system wide 'event count' that is incremented 273 * We have a system wide 'event count' that is incremented
220 * on any 'interesting' event, and readers of /proc/mdstat 274 * on any 'interesting' event, and readers of /proc/mdstat
@@ -690,7 +744,12 @@ static void mddev_unlock(mddev_t * mddev)
690 } else 744 } else
691 mutex_unlock(&mddev->reconfig_mutex); 745 mutex_unlock(&mddev->reconfig_mutex);
692 746
 747	/* As we've dropped the mutex we need a spinlock to
 748	 * make sure the thread doesn't disappear
 749	 */
750 spin_lock(&pers_lock);
693 md_wakeup_thread(mddev->thread); 751 md_wakeup_thread(mddev->thread);
752 spin_unlock(&pers_lock);
694} 753}
695 754
696static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 755static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -757,6 +816,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
757 rdev->sb_start = 0; 816 rdev->sb_start = 0;
758 rdev->sectors = 0; 817 rdev->sectors = 0;
759 } 818 }
819 if (rdev->bb_page) {
820 put_page(rdev->bb_page);
821 rdev->bb_page = NULL;
822 }
760} 823}
761 824
762 825
@@ -795,7 +858,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
795 bio->bi_end_io = super_written; 858 bio->bi_end_io = super_written;
796 859
797 atomic_inc(&mddev->pending_writes); 860 atomic_inc(&mddev->pending_writes);
798 submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); 861 submit_bio(WRITE_FLUSH_FUA, bio);
799} 862}
800 863
801void md_super_wait(mddev_t *mddev) 864void md_super_wait(mddev_t *mddev)
@@ -1025,7 +1088,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1025 ret = -EINVAL; 1088 ret = -EINVAL;
1026 1089
1027 bdevname(rdev->bdev, b); 1090 bdevname(rdev->bdev, b);
1028 sb = (mdp_super_t*)page_address(rdev->sb_page); 1091 sb = page_address(rdev->sb_page);
1029 1092
1030 if (sb->md_magic != MD_SB_MAGIC) { 1093 if (sb->md_magic != MD_SB_MAGIC) {
1031 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 1094 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1117,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1054 rdev->preferred_minor = sb->md_minor; 1117 rdev->preferred_minor = sb->md_minor;
1055 rdev->data_offset = 0; 1118 rdev->data_offset = 0;
1056 rdev->sb_size = MD_SB_BYTES; 1119 rdev->sb_size = MD_SB_BYTES;
1120 rdev->badblocks.shift = -1;
1057 1121
1058 if (sb->level == LEVEL_MULTIPATH) 1122 if (sb->level == LEVEL_MULTIPATH)
1059 rdev->desc_nr = -1; 1123 rdev->desc_nr = -1;
@@ -1064,7 +1128,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1064 ret = 1; 1128 ret = 1;
1065 } else { 1129 } else {
1066 __u64 ev1, ev2; 1130 __u64 ev1, ev2;
1067 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 1131 mdp_super_t *refsb = page_address(refdev->sb_page);
1068 if (!uuid_equal(refsb, sb)) { 1132 if (!uuid_equal(refsb, sb)) {
1069 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1133 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1070 b, bdevname(refdev->bdev,b2)); 1134 b, bdevname(refdev->bdev,b2));
@@ -1084,8 +1148,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1084 ret = 0; 1148 ret = 0;
1085 } 1149 }
1086 rdev->sectors = rdev->sb_start; 1150 rdev->sectors = rdev->sb_start;
1151 /* Limit to 4TB as metadata cannot record more than that */
1152 if (rdev->sectors >= (2ULL << 32))
1153 rdev->sectors = (2ULL << 32) - 2;
1087 1154
1088 if (rdev->sectors < sb->size * 2 && sb->level > 1) 1155 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1089 /* "this cannot possibly happen" ... */ 1156 /* "this cannot possibly happen" ... */
1090 ret = -EINVAL; 1157 ret = -EINVAL;
1091 1158
@@ -1099,7 +1166,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1099static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1166static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1100{ 1167{
1101 mdp_disk_t *desc; 1168 mdp_disk_t *desc;
1102 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 1169 mdp_super_t *sb = page_address(rdev->sb_page);
1103 __u64 ev1 = md_event(sb); 1170 __u64 ev1 = md_event(sb);
1104 1171
1105 rdev->raid_disk = -1; 1172 rdev->raid_disk = -1;
@@ -1119,7 +1186,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1119 mddev->clevel[0] = 0; 1186 mddev->clevel[0] = 0;
1120 mddev->layout = sb->layout; 1187 mddev->layout = sb->layout;
1121 mddev->raid_disks = sb->raid_disks; 1188 mddev->raid_disks = sb->raid_disks;
1122 mddev->dev_sectors = sb->size * 2; 1189 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1123 mddev->events = ev1; 1190 mddev->events = ev1;
1124 mddev->bitmap_info.offset = 0; 1191 mddev->bitmap_info.offset = 0;
1125 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1192 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@ -1230,7 +1297,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1230 1297
1231 rdev->sb_size = MD_SB_BYTES; 1298 rdev->sb_size = MD_SB_BYTES;
1232 1299
1233 sb = (mdp_super_t*)page_address(rdev->sb_page); 1300 sb = page_address(rdev->sb_page);
1234 1301
1235 memset(sb, 0, sizeof(*sb)); 1302 memset(sb, 0, sizeof(*sb));
1236 1303
@@ -1361,6 +1428,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1361 rdev->sb_start = calc_dev_sboffset(rdev); 1428 rdev->sb_start = calc_dev_sboffset(rdev);
1362 if (!num_sectors || num_sectors > rdev->sb_start) 1429 if (!num_sectors || num_sectors > rdev->sb_start)
1363 num_sectors = rdev->sb_start; 1430 num_sectors = rdev->sb_start;
1431 /* Limit to 4TB as metadata cannot record more than that.
1432 * 4TB == 2^32 KB, or 2*2^32 sectors.
1433 */
1434 if (num_sectors >= (2ULL << 32))
1435 num_sectors = (2ULL << 32) - 2;
1364 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1436 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1365 rdev->sb_page); 1437 rdev->sb_page);
1366 md_super_wait(rdev->mddev); 1438 md_super_wait(rdev->mddev);
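To make the 4TB clamp above concrete: the 0.90 superblock stores a device's size as a 32-bit count of 1KB blocks, so the largest size it can describe is 2^32 KB = 4TB, i.e. 2 * 2^32 = 2^33 sectors. Clamping to (2ULL << 32) - 2 sectors leaves exactly 2^32 - 1 KB, the largest value that still fits in that field, rather than letting the recorded size wrap.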
@@ -1395,6 +1467,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1395 return cpu_to_le32(csum); 1467 return cpu_to_le32(csum);
1396} 1468}
1397 1469
1470static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1471 int acknowledged);
1398static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1472static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1399{ 1473{
1400 struct mdp_superblock_1 *sb; 1474 struct mdp_superblock_1 *sb;
@@ -1435,7 +1509,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435 if (ret) return ret; 1509 if (ret) return ret;
1436 1510
1437 1511
1438 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1512 sb = page_address(rdev->sb_page);
1439 1513
1440 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1514 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1441 sb->major_version != cpu_to_le32(1) || 1515 sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1547,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1473 else 1547 else
1474 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1548 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1475 1549
1550 if (!rdev->bb_page) {
1551 rdev->bb_page = alloc_page(GFP_KERNEL);
1552 if (!rdev->bb_page)
1553 return -ENOMEM;
1554 }
1555 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1556 rdev->badblocks.count == 0) {
1557 /* need to load the bad block list.
1558 * Currently we limit it to one page.
1559 */
1560 s32 offset;
1561 sector_t bb_sector;
1562 u64 *bbp;
1563 int i;
1564 int sectors = le16_to_cpu(sb->bblog_size);
1565 if (sectors > (PAGE_SIZE / 512))
1566 return -EINVAL;
1567 offset = le32_to_cpu(sb->bblog_offset);
1568 if (offset == 0)
1569 return -EINVAL;
1570 bb_sector = (long long)offset;
1571 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1572 rdev->bb_page, READ, true))
1573 return -EIO;
1574 bbp = (u64 *)page_address(rdev->bb_page);
1575 rdev->badblocks.shift = sb->bblog_shift;
1576 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1577 u64 bb = le64_to_cpu(*bbp);
1578 int count = bb & (0x3ff);
1579 u64 sector = bb >> 10;
1580 sector <<= sb->bblog_shift;
1581 count <<= sb->bblog_shift;
1582 if (bb + 1 == 0)
1583 break;
1584 if (md_set_badblocks(&rdev->badblocks,
1585 sector, count, 1) == 0)
1586 return -EINVAL;
1587 }
1588 } else if (sb->bblog_offset == 0)
1589 rdev->badblocks.shift = -1;
1590
1476 if (!refdev) { 1591 if (!refdev) {
1477 ret = 1; 1592 ret = 1;
1478 } else { 1593 } else {
1479 __u64 ev1, ev2; 1594 __u64 ev1, ev2;
1480 struct mdp_superblock_1 *refsb = 1595 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1481 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1482 1596
1483 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1597 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1484 sb->level != refsb->level || 1598 sb->level != refsb->level ||
@@ -1513,7 +1627,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1513 1627
1514static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1628static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1515{ 1629{
1516 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1630 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1517 __u64 ev1 = le64_to_cpu(sb->events); 1631 __u64 ev1 = le64_to_cpu(sb->events);
1518 1632
1519 rdev->raid_disk = -1; 1633 rdev->raid_disk = -1;
@@ -1619,13 +1733,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1619 int max_dev, i; 1733 int max_dev, i;
1620 /* make rdev->sb match mddev and rdev data. */ 1734 /* make rdev->sb match mddev and rdev data. */
1621 1735
1622 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1736 sb = page_address(rdev->sb_page);
1623 1737
1624 sb->feature_map = 0; 1738 sb->feature_map = 0;
1625 sb->pad0 = 0; 1739 sb->pad0 = 0;
1626 sb->recovery_offset = cpu_to_le64(0); 1740 sb->recovery_offset = cpu_to_le64(0);
1627 memset(sb->pad1, 0, sizeof(sb->pad1)); 1741 memset(sb->pad1, 0, sizeof(sb->pad1));
1628 memset(sb->pad2, 0, sizeof(sb->pad2));
1629 memset(sb->pad3, 0, sizeof(sb->pad3)); 1742 memset(sb->pad3, 0, sizeof(sb->pad3));
1630 1743
1631 sb->utime = cpu_to_le64((__u64)mddev->utime); 1744 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1643,6 +1756,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1643 sb->level = cpu_to_le32(mddev->level); 1756 sb->level = cpu_to_le32(mddev->level);
1644 sb->layout = cpu_to_le32(mddev->layout); 1757 sb->layout = cpu_to_le32(mddev->layout);
1645 1758
1759 if (test_bit(WriteMostly, &rdev->flags))
1760 sb->devflags |= WriteMostly1;
1761 else
1762 sb->devflags &= ~WriteMostly1;
1763
1646 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1764 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1647 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1765 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1648 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1766 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1665,6 +1783,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1665 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1783 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1666 } 1784 }
1667 1785
1786 if (rdev->badblocks.count == 0)
1787 /* Nothing to do for bad blocks*/ ;
1788 else if (sb->bblog_offset == 0)
1789 /* Cannot record bad blocks on this device */
1790 md_error(mddev, rdev);
1791 else {
1792 struct badblocks *bb = &rdev->badblocks;
1793 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1794 u64 *p = bb->page;
1795 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1796 if (bb->changed) {
1797 unsigned seq;
1798
1799retry:
1800 seq = read_seqbegin(&bb->lock);
1801
1802 memset(bbp, 0xff, PAGE_SIZE);
1803
1804 for (i = 0 ; i < bb->count ; i++) {
1805 u64 internal_bb = *p++;
1806 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1807 | BB_LEN(internal_bb));
1808 *bbp++ = cpu_to_le64(store_bb);
1809 }
1810 if (read_seqretry(&bb->lock, seq))
1811 goto retry;
1812
1813 bb->sector = (rdev->sb_start +
1814 (int)le32_to_cpu(sb->bblog_offset));
1815 bb->size = le16_to_cpu(sb->bblog_size);
1816 bb->changed = 0;
1817 }
1818 }
1819
1668 max_dev = 0; 1820 max_dev = 0;
1669 list_for_each_entry(rdev2, &mddev->disks, same_set) 1821 list_for_each_entry(rdev2, &mddev->disks, same_set)
1670 if (rdev2->desc_nr+1 > max_dev) 1822 if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1876,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1724 num_sectors = max_sectors; 1876 num_sectors = max_sectors;
1725 rdev->sb_start = sb_start; 1877 rdev->sb_start = sb_start;
1726 } 1878 }
1727 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1879 sb = page_address(rdev->sb_page);
1728 sb->data_size = cpu_to_le64(num_sectors); 1880 sb->data_size = cpu_to_le64(num_sectors);
1729 sb->super_offset = rdev->sb_start; 1881 sb->super_offset = rdev->sb_start;
1730 sb->sb_csum = calc_sb_1_csum(sb); 1882 sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2074,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1922 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2074 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1923 2075
1924 /* May as well allow recovery to be retried once */ 2076 /* May as well allow recovery to be retried once */
1925 mddev->recovery_disabled = 0; 2077 mddev->recovery_disabled++;
1926 2078
1927 return 0; 2079 return 0;
1928 2080
@@ -1953,6 +2105,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1953 sysfs_remove_link(&rdev->kobj, "block"); 2105 sysfs_remove_link(&rdev->kobj, "block");
1954 sysfs_put(rdev->sysfs_state); 2106 sysfs_put(rdev->sysfs_state);
1955 rdev->sysfs_state = NULL; 2107 rdev->sysfs_state = NULL;
2108 kfree(rdev->badblocks.page);
2109 rdev->badblocks.count = 0;
2110 rdev->badblocks.page = NULL;
1956 /* We need to delay this, otherwise we can deadlock when 2111 /* We need to delay this, otherwise we can deadlock when
1957 * writing to 'remove' to "dev/state". We also need 2112 * writing to 'remove' to "dev/state". We also need
1958 * to delay it due to rcu usage. 2113 * to delay it due to rcu usage.
@@ -2127,10 +2282,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
2127 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2282 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2128 switch (major_version) { 2283 switch (major_version) {
2129 case 0: 2284 case 0:
2130 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 2285 print_sb_90(page_address(rdev->sb_page));
2131 break; 2286 break;
2132 case 1: 2287 case 1:
2133 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 2288 print_sb_1(page_address(rdev->sb_page));
2134 break; 2289 break;
2135 } 2290 }
2136 } else 2291 } else
@@ -2194,6 +2349,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
2194 mdk_rdev_t *rdev; 2349 mdk_rdev_t *rdev;
2195 int sync_req; 2350 int sync_req;
2196 int nospares = 0; 2351 int nospares = 0;
2352 int any_badblocks_changed = 0;
2197 2353
2198repeat: 2354repeat:
2199 /* First make sure individual recovery_offsets are correct */ 2355 /* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2364,18 @@ repeat:
2208 if (!mddev->persistent) { 2364 if (!mddev->persistent) {
2209 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2365 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2210 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2366 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2211 if (!mddev->external) 2367 if (!mddev->external) {
2212 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2368 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2369 list_for_each_entry(rdev, &mddev->disks, same_set) {
2370 if (rdev->badblocks.changed) {
2371 md_ack_all_badblocks(&rdev->badblocks);
2372 md_error(mddev, rdev);
2373 }
2374 clear_bit(Blocked, &rdev->flags);
2375 clear_bit(BlockedBadBlocks, &rdev->flags);
2376 wake_up(&rdev->blocked_wait);
2377 }
2378 }
2213 wake_up(&mddev->sb_wait); 2379 wake_up(&mddev->sb_wait);
2214 return; 2380 return;
2215 } 2381 }
@@ -2265,6 +2431,14 @@ repeat:
2265 MD_BUG(); 2431 MD_BUG();
2266 mddev->events --; 2432 mddev->events --;
2267 } 2433 }
2434
2435 list_for_each_entry(rdev, &mddev->disks, same_set) {
2436 if (rdev->badblocks.changed)
2437 any_badblocks_changed++;
2438 if (test_bit(Faulty, &rdev->flags))
2439 set_bit(FaultRecorded, &rdev->flags);
2440 }
2441
2268 sync_sbs(mddev, nospares); 2442 sync_sbs(mddev, nospares);
2269 spin_unlock_irq(&mddev->write_lock); 2443 spin_unlock_irq(&mddev->write_lock);
2270 2444
@@ -2290,6 +2464,13 @@ repeat:
2290 bdevname(rdev->bdev,b), 2464 bdevname(rdev->bdev,b),
2291 (unsigned long long)rdev->sb_start); 2465 (unsigned long long)rdev->sb_start);
2292 rdev->sb_events = mddev->events; 2466 rdev->sb_events = mddev->events;
2467 if (rdev->badblocks.size) {
2468 md_super_write(mddev, rdev,
2469 rdev->badblocks.sector,
2470 rdev->badblocks.size << 9,
2471 rdev->bb_page);
2472 rdev->badblocks.size = 0;
2473 }
2293 2474
2294 } else 2475 } else
2295 dprintk(")\n"); 2476 dprintk(")\n");
@@ -2313,6 +2494,15 @@ repeat:
2313 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2314 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2315 2496
2497 list_for_each_entry(rdev, &mddev->disks, same_set) {
2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2499 clear_bit(Blocked, &rdev->flags);
2500
2501 if (any_badblocks_changed)
2502 md_ack_all_badblocks(&rdev->badblocks);
2503 clear_bit(BlockedBadBlocks, &rdev->flags);
2504 wake_up(&rdev->blocked_wait);
2505 }
2316} 2506}
2317 2507
2318/* words written to sysfs files may, or may not, be \n terminated. 2508/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2537,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2347 char *sep = ""; 2537 char *sep = "";
2348 size_t len = 0; 2538 size_t len = 0;
2349 2539
2350 if (test_bit(Faulty, &rdev->flags)) { 2540 if (test_bit(Faulty, &rdev->flags) ||
2541 rdev->badblocks.unacked_exist) {
2351 len+= sprintf(page+len, "%sfaulty",sep); 2542 len+= sprintf(page+len, "%sfaulty",sep);
2352 sep = ","; 2543 sep = ",";
2353 } 2544 }
@@ -2359,7 +2550,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2359 len += sprintf(page+len, "%swrite_mostly",sep); 2550 len += sprintf(page+len, "%swrite_mostly",sep);
2360 sep = ","; 2551 sep = ",";
2361 } 2552 }
2362 if (test_bit(Blocked, &rdev->flags)) { 2553 if (test_bit(Blocked, &rdev->flags) ||
2554 rdev->badblocks.unacked_exist) {
2363 len += sprintf(page+len, "%sblocked", sep); 2555 len += sprintf(page+len, "%sblocked", sep);
2364 sep = ","; 2556 sep = ",";
2365 } 2557 }
@@ -2368,6 +2560,10 @@ state_show(mdk_rdev_t *rdev, char *page)
2368 len += sprintf(page+len, "%sspare", sep); 2560 len += sprintf(page+len, "%sspare", sep);
2369 sep = ","; 2561 sep = ",";
2370 } 2562 }
2563 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2564 len += sprintf(page+len, "%swrite_error", sep);
2565 sep = ",";
2566 }
2371 return len+sprintf(page+len, "\n"); 2567 return len+sprintf(page+len, "\n");
2372} 2568}
2373 2569
@@ -2375,18 +2571,23 @@ static ssize_t
2375state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2571state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2376{ 2572{
2377 /* can write 2573 /* can write
2378 * faulty - simulates and error 2574 * faulty - simulates an error
2379 * remove - disconnects the device 2575 * remove - disconnects the device
2380 * writemostly - sets write_mostly 2576 * writemostly - sets write_mostly
2381 * -writemostly - clears write_mostly 2577 * -writemostly - clears write_mostly
2382 * blocked - sets the Blocked flag 2578 * blocked - sets the Blocked flags
2383 * -blocked - clears the Blocked flag 2579 * -blocked - clears the Blocked and possibly simulates an error
2384 * insync - sets Insync providing device isn't active 2580 * insync - sets Insync providing device isn't active
2581 * write_error - sets WriteErrorSeen
2582 * -write_error - clears WriteErrorSeen
2385 */ 2583 */
2386 int err = -EINVAL; 2584 int err = -EINVAL;
2387 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2585 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2388 md_error(rdev->mddev, rdev); 2586 md_error(rdev->mddev, rdev);
2389 err = 0; 2587 if (test_bit(Faulty, &rdev->flags))
2588 err = 0;
2589 else
2590 err = -EBUSY;
2390 } else if (cmd_match(buf, "remove")) { 2591 } else if (cmd_match(buf, "remove")) {
2391 if (rdev->raid_disk >= 0) 2592 if (rdev->raid_disk >= 0)
2392 err = -EBUSY; 2593 err = -EBUSY;
@@ -2408,7 +2609,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2408 set_bit(Blocked, &rdev->flags); 2609 set_bit(Blocked, &rdev->flags);
2409 err = 0; 2610 err = 0;
2410 } else if (cmd_match(buf, "-blocked")) { 2611 } else if (cmd_match(buf, "-blocked")) {
2612 if (!test_bit(Faulty, &rdev->flags) &&
2613 rdev->badblocks.unacked_exist) {
2614 /* metadata handler doesn't understand badblocks,
2615 * so we need to fail the device
2616 */
2617 md_error(rdev->mddev, rdev);
2618 }
2411 clear_bit(Blocked, &rdev->flags); 2619 clear_bit(Blocked, &rdev->flags);
2620 clear_bit(BlockedBadBlocks, &rdev->flags);
2412 wake_up(&rdev->blocked_wait); 2621 wake_up(&rdev->blocked_wait);
2413 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2622 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2414 md_wakeup_thread(rdev->mddev->thread); 2623 md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2626,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2417 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2626 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2418 set_bit(In_sync, &rdev->flags); 2627 set_bit(In_sync, &rdev->flags);
2419 err = 0; 2628 err = 0;
2629 } else if (cmd_match(buf, "write_error")) {
2630 set_bit(WriteErrorSeen, &rdev->flags);
2631 err = 0;
2632 } else if (cmd_match(buf, "-write_error")) {
2633 clear_bit(WriteErrorSeen, &rdev->flags);
2634 err = 0;
2420 } 2635 }
2421 if (!err) 2636 if (!err)
2422 sysfs_notify_dirent_safe(rdev->sysfs_state); 2637 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2674,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2459{ 2674{
2460 char *e; 2675 char *e;
2461 int err; 2676 int err;
2462 char nm[20];
2463 int slot = simple_strtoul(buf, &e, 10); 2677 int slot = simple_strtoul(buf, &e, 10);
2464 if (strncmp(buf, "none", 4)==0) 2678 if (strncmp(buf, "none", 4)==0)
2465 slot = -1; 2679 slot = -1;
@@ -2482,8 +2696,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2696 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2483 if (err) 2697 if (err)
2484 return err; 2698 return err;
2485 sprintf(nm, "rd%d", rdev->raid_disk); 2699 sysfs_unlink_rdev(rdev->mddev, rdev);
2486 sysfs_remove_link(&rdev->mddev->kobj, nm);
2487 rdev->raid_disk = -1; 2700 rdev->raid_disk = -1;
2488 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2701 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2489 md_wakeup_thread(rdev->mddev->thread); 2702 md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2735,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2522 return err; 2735 return err;
2523 } else 2736 } else
2524 sysfs_notify_dirent_safe(rdev->sysfs_state); 2737 sysfs_notify_dirent_safe(rdev->sysfs_state);
2525 sprintf(nm, "rd%d", rdev->raid_disk); 2738 if (sysfs_link_rdev(rdev->mddev, rdev))
2526 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2527 /* failure here is OK */; 2739 /* failure here is OK */;
2528 /* don't wakeup anyone, leave that to userspace. */ 2740 /* don't wakeup anyone, leave that to userspace. */
2529 } else { 2741 } else {
@@ -2712,6 +2924,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
2712static struct rdev_sysfs_entry rdev_recovery_start = 2924static struct rdev_sysfs_entry rdev_recovery_start =
2713__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2925__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2714 2926
2927
2928static ssize_t
2929badblocks_show(struct badblocks *bb, char *page, int unack);
2930static ssize_t
2931badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2932
2933static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2934{
2935 return badblocks_show(&rdev->badblocks, page, 0);
2936}
2937static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2938{
2939 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2940 /* Maybe that ack was all we needed */
2941 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2942 wake_up(&rdev->blocked_wait);
2943 return rv;
2944}
2945static struct rdev_sysfs_entry rdev_bad_blocks =
2946__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2947
2948
2949static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
2950{
2951 return badblocks_show(&rdev->badblocks, page, 1);
2952}
2953static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2954{
2955 return badblocks_store(&rdev->badblocks, page, len, 1);
2956}
2957static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2958__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2959
2715static struct attribute *rdev_default_attrs[] = { 2960static struct attribute *rdev_default_attrs[] = {
2716 &rdev_state.attr, 2961 &rdev_state.attr,
2717 &rdev_errors.attr, 2962 &rdev_errors.attr,
@@ -2719,6 +2964,8 @@ static struct attribute *rdev_default_attrs[] = {
2719 &rdev_offset.attr, 2964 &rdev_offset.attr,
2720 &rdev_size.attr, 2965 &rdev_size.attr,
2721 &rdev_recovery_start.attr, 2966 &rdev_recovery_start.attr,
2967 &rdev_bad_blocks.attr,
2968 &rdev_unack_bad_blocks.attr,
2722 NULL, 2969 NULL,
2723}; 2970};
2724static ssize_t 2971static ssize_t
@@ -2782,7 +3029,7 @@ static struct kobj_type rdev_ktype = {
2782 .default_attrs = rdev_default_attrs, 3029 .default_attrs = rdev_default_attrs,
2783}; 3030};
2784 3031
2785void md_rdev_init(mdk_rdev_t *rdev) 3032int md_rdev_init(mdk_rdev_t *rdev)
2786{ 3033{
2787 rdev->desc_nr = -1; 3034 rdev->desc_nr = -1;
2788 rdev->saved_raid_disk = -1; 3035 rdev->saved_raid_disk = -1;
@@ -2792,12 +3039,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
2792 rdev->sb_events = 0; 3039 rdev->sb_events = 0;
2793 rdev->last_read_error.tv_sec = 0; 3040 rdev->last_read_error.tv_sec = 0;
2794 rdev->last_read_error.tv_nsec = 0; 3041 rdev->last_read_error.tv_nsec = 0;
3042 rdev->sb_loaded = 0;
3043 rdev->bb_page = NULL;
2795 atomic_set(&rdev->nr_pending, 0); 3044 atomic_set(&rdev->nr_pending, 0);
2796 atomic_set(&rdev->read_errors, 0); 3045 atomic_set(&rdev->read_errors, 0);
2797 atomic_set(&rdev->corrected_errors, 0); 3046 atomic_set(&rdev->corrected_errors, 0);
2798 3047
2799 INIT_LIST_HEAD(&rdev->same_set); 3048 INIT_LIST_HEAD(&rdev->same_set);
2800 init_waitqueue_head(&rdev->blocked_wait); 3049 init_waitqueue_head(&rdev->blocked_wait);
3050
3051 /* Add space to store bad block list.
3052 * This reserves the space even on arrays where it cannot
3053 * be used - I wonder if that matters
3054 */
3055 rdev->badblocks.count = 0;
3056 rdev->badblocks.shift = 0;
3057 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3058 seqlock_init(&rdev->badblocks.lock);
3059 if (rdev->badblocks.page == NULL)
3060 return -ENOMEM;
3061
3062 return 0;
2801} 3063}
2802EXPORT_SYMBOL_GPL(md_rdev_init); 3064EXPORT_SYMBOL_GPL(md_rdev_init);
2803/* 3065/*
@@ -2823,8 +3085,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2823 return ERR_PTR(-ENOMEM); 3085 return ERR_PTR(-ENOMEM);
2824 } 3086 }
2825 3087
2826 md_rdev_init(rdev); 3088 err = md_rdev_init(rdev);
2827 if ((err = alloc_disk_sb(rdev))) 3089 if (err)
3090 goto abort_free;
3091 err = alloc_disk_sb(rdev);
3092 if (err)
2828 goto abort_free; 3093 goto abort_free;
2829 3094
2830 err = lock_rdev(rdev, newdev, super_format == -2); 3095 err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3125,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2860 goto abort_free; 3125 goto abort_free;
2861 } 3126 }
2862 } 3127 }
3128 if (super_format == -1)
3129 /* hot-add for 0.90, or non-persistent: so no badblocks */
3130 rdev->badblocks.shift = -1;
2863 3131
2864 return rdev; 3132 return rdev;
2865 3133
2866abort_free: 3134abort_free:
2867 if (rdev->sb_page) { 3135 if (rdev->bdev)
2868 if (rdev->bdev) 3136 unlock_rdev(rdev);
2869 unlock_rdev(rdev); 3137 free_disk_sb(rdev);
2870 free_disk_sb(rdev); 3138 kfree(rdev->badblocks.page);
2871 }
2872 kfree(rdev); 3139 kfree(rdev);
2873 return ERR_PTR(err); 3140 return ERR_PTR(err);
2874} 3141}
@@ -3149,15 +3416,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3149 } 3416 }
3150 3417
3151 list_for_each_entry(rdev, &mddev->disks, same_set) { 3418 list_for_each_entry(rdev, &mddev->disks, same_set) {
3152 char nm[20];
3153 if (rdev->raid_disk < 0) 3419 if (rdev->raid_disk < 0)
3154 continue; 3420 continue;
3155 if (rdev->new_raid_disk >= mddev->raid_disks) 3421 if (rdev->new_raid_disk >= mddev->raid_disks)
3156 rdev->new_raid_disk = -1; 3422 rdev->new_raid_disk = -1;
3157 if (rdev->new_raid_disk == rdev->raid_disk) 3423 if (rdev->new_raid_disk == rdev->raid_disk)
3158 continue; 3424 continue;
3159 sprintf(nm, "rd%d", rdev->raid_disk); 3425 sysfs_unlink_rdev(mddev, rdev);
3160 sysfs_remove_link(&mddev->kobj, nm);
3161 } 3426 }
3162 list_for_each_entry(rdev, &mddev->disks, same_set) { 3427 list_for_each_entry(rdev, &mddev->disks, same_set) {
3163 if (rdev->raid_disk < 0) 3428 if (rdev->raid_disk < 0)
@@ -3168,11 +3433,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3168 if (rdev->raid_disk < 0) 3433 if (rdev->raid_disk < 0)
3169 clear_bit(In_sync, &rdev->flags); 3434 clear_bit(In_sync, &rdev->flags);
3170 else { 3435 else {
3171 char nm[20]; 3436 if (sysfs_link_rdev(mddev, rdev))
3172 sprintf(nm, "rd%d", rdev->raid_disk); 3437 printk(KERN_WARNING "md: cannot register rd%d"
3173 if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3438 " for %s after level change\n",
3174 printk("md: cannot register %s for %s after level change\n", 3439 rdev->raid_disk, mdname(mddev));
3175 nm, mdname(mddev));
3176 } 3440 }
3177 } 3441 }
3178 3442
@@ -4504,7 +4768,8 @@ int md_run(mddev_t *mddev)
4504 } 4768 }
4505 4769
4506 if (mddev->bio_set == NULL) 4770 if (mddev->bio_set == NULL)
4507 mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); 4771 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4772 sizeof(mddev_t *));
4508 4773
4509 spin_lock(&pers_lock); 4774 spin_lock(&pers_lock);
4510 pers = find_pers(mddev->level, mddev->clevel); 4775 pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4886,9 @@ int md_run(mddev_t *mddev)
4621 smp_wmb(); 4886 smp_wmb();
4622 mddev->ready = 1; 4887 mddev->ready = 1;
4623 list_for_each_entry(rdev, &mddev->disks, same_set) 4888 list_for_each_entry(rdev, &mddev->disks, same_set)
4624 if (rdev->raid_disk >= 0) { 4889 if (rdev->raid_disk >= 0)
4625 char nm[20]; 4890 if (sysfs_link_rdev(mddev, rdev))
4626 sprintf(nm, "rd%d", rdev->raid_disk);
4627 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4628 /* failure here is OK */; 4891 /* failure here is OK */;
4629 }
4630 4892
4631 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4893 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4632 4894
@@ -4854,11 +5116,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4854 sysfs_notify_dirent_safe(mddev->sysfs_state); 5116 sysfs_notify_dirent_safe(mddev->sysfs_state);
4855 5117
4856 list_for_each_entry(rdev, &mddev->disks, same_set) 5118 list_for_each_entry(rdev, &mddev->disks, same_set)
4857 if (rdev->raid_disk >= 0) { 5119 if (rdev->raid_disk >= 0)
4858 char nm[20]; 5120 sysfs_unlink_rdev(mddev, rdev);
4859 sprintf(nm, "rd%d", rdev->raid_disk);
4860 sysfs_remove_link(&mddev->kobj, nm);
4861 }
4862 5121
4863 set_capacity(disk, 0); 5122 set_capacity(disk, 0);
4864 mutex_unlock(&mddev->open_mutex); 5123 mutex_unlock(&mddev->open_mutex);
@@ -5750,6 +6009,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5750 return -ENODEV; 6009 return -ENODEV;
5751 6010
5752 md_error(mddev, rdev); 6011 md_error(mddev, rdev);
6012 if (!test_bit(Faulty, &rdev->flags))
6013 return -EBUSY;
5753 return 0; 6014 return 0;
5754} 6015}
5755 6016
@@ -6178,11 +6439,18 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
6178 return thread; 6439 return thread;
6179} 6440}
6180 6441
6181void md_unregister_thread(mdk_thread_t *thread) 6442void md_unregister_thread(mdk_thread_t **threadp)
6182{ 6443{
6444 mdk_thread_t *thread = *threadp;
6183 if (!thread) 6445 if (!thread)
6184 return; 6446 return;
6185 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6447 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6448 /* Locking ensures that mddev_unlock does not wake_up a
6449 * non-existent thread
6450 */
6451 spin_lock(&pers_lock);
6452 *threadp = NULL;
6453 spin_unlock(&pers_lock);
6186 6454
6187 kthread_stop(thread->tsk); 6455 kthread_stop(thread->tsk);
6188 kfree(thread); 6456 kfree(thread);
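The new signature lets md clear the caller's thread pointer itself, under pers_lock, instead of leaving a window between unregistering and NULL-ing it. A before/after sketch of a call site (reap_sync_thread() further down shows the real conversion):

	/* before: a wakeup from mddev_unlock() could race with the free */
	md_unregister_thread(mddev->sync_thread);
	mddev->sync_thread = NULL;

	/* after: the pointer is cleared inside md_unregister_thread(),
	 * under the same pers_lock that mddev_unlock() now takes around
	 * md_wakeup_thread(), so the wakeup can never see a freed thread.
	 */
	md_unregister_thread(&mddev->sync_thread);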
@@ -6198,18 +6466,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6198 if (!rdev || test_bit(Faulty, &rdev->flags)) 6466 if (!rdev || test_bit(Faulty, &rdev->flags))
6199 return; 6467 return;
6200 6468
6201 if (mddev->external) 6469 if (!mddev->pers || !mddev->pers->error_handler)
6202 set_bit(Blocked, &rdev->flags);
6203/*
6204 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6205 mdname(mddev),
6206 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6207 __builtin_return_address(0),__builtin_return_address(1),
6208 __builtin_return_address(2),__builtin_return_address(3));
6209*/
6210 if (!mddev->pers)
6211 return;
6212 if (!mddev->pers->error_handler)
6213 return; 6470 return;
6214 mddev->pers->error_handler(mddev,rdev); 6471 mddev->pers->error_handler(mddev,rdev);
6215 if (mddev->degraded) 6472 if (mddev->degraded)
@@ -6394,16 +6651,11 @@ static void md_seq_stop(struct seq_file *seq, void *v)
6394 mddev_put(mddev); 6651 mddev_put(mddev);
6395} 6652}
6396 6653
6397struct mdstat_info {
6398 int event;
6399};
6400
6401static int md_seq_show(struct seq_file *seq, void *v) 6654static int md_seq_show(struct seq_file *seq, void *v)
6402{ 6655{
6403 mddev_t *mddev = v; 6656 mddev_t *mddev = v;
6404 sector_t sectors; 6657 sector_t sectors;
6405 mdk_rdev_t *rdev; 6658 mdk_rdev_t *rdev;
6406 struct mdstat_info *mi = seq->private;
6407 struct bitmap *bitmap; 6659 struct bitmap *bitmap;
6408 6660
6409 if (v == (void*)1) { 6661 if (v == (void*)1) {
@@ -6415,7 +6667,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6415 6667
6416 spin_unlock(&pers_lock); 6668 spin_unlock(&pers_lock);
6417 seq_printf(seq, "\n"); 6669 seq_printf(seq, "\n");
6418 mi->event = atomic_read(&md_event_count); 6670 seq->poll_event = atomic_read(&md_event_count);
6419 return 0; 6671 return 0;
6420 } 6672 }
6421 if (v == (void*)2) { 6673 if (v == (void*)2) {
@@ -6527,26 +6779,21 @@ static const struct seq_operations md_seq_ops = {
6527 6779
6528static int md_seq_open(struct inode *inode, struct file *file) 6780static int md_seq_open(struct inode *inode, struct file *file)
6529{ 6781{
6782 struct seq_file *seq;
6530 int error; 6783 int error;
6531 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
6532 if (mi == NULL)
6533 return -ENOMEM;
6534 6784
6535 error = seq_open(file, &md_seq_ops); 6785 error = seq_open(file, &md_seq_ops);
6536 if (error) 6786 if (error)
6537 kfree(mi); 6787 return error;
6538 else { 6788
6539 struct seq_file *p = file->private_data; 6789 seq = file->private_data;
6540 p->private = mi; 6790 seq->poll_event = atomic_read(&md_event_count);
6541 mi->event = atomic_read(&md_event_count);
6542 }
6543 return error; 6791 return error;
6544} 6792}
6545 6793
6546static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6794static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6547{ 6795{
6548 struct seq_file *m = filp->private_data; 6796 struct seq_file *seq = filp->private_data;
6549 struct mdstat_info *mi = m->private;
6550 int mask; 6797 int mask;
6551 6798
6552 poll_wait(filp, &md_event_waiters, wait); 6799 poll_wait(filp, &md_event_waiters, wait);
@@ -6554,7 +6801,7 @@ static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6554 /* always allow read */ 6801 /* always allow read */
6555 mask = POLLIN | POLLRDNORM; 6802 mask = POLLIN | POLLRDNORM;
6556 6803
6557 if (mi->event != atomic_read(&md_event_count)) 6804 if (seq->poll_event != atomic_read(&md_event_count))
6558 mask |= POLLERR | POLLPRI; 6805 mask |= POLLERR | POLLPRI;
6559 return mask; 6806 return mask;
6560} 6807}
@@ -6943,11 +7190,14 @@ void md_do_sync(mddev_t *mddev)
6943 atomic_add(sectors, &mddev->recovery_active); 7190 atomic_add(sectors, &mddev->recovery_active);
6944 } 7191 }
6945 7192
7193 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7194 break;
7195
6946 j += sectors; 7196 j += sectors;
6947 if (j>1) mddev->curr_resync = j; 7197 if (j>1) mddev->curr_resync = j;
6948 mddev->curr_mark_cnt = io_sectors; 7198 mddev->curr_mark_cnt = io_sectors;
6949 if (last_check == 0) 7199 if (last_check == 0)
6950 /* this is the earliers that rebuilt will be 7200 /* this is the earliest that rebuild will be
6951 * visible in /proc/mdstat 7201 * visible in /proc/mdstat
6952 */ 7202 */
6953 md_new_event(mddev); 7203 md_new_event(mddev);
@@ -6956,10 +7206,6 @@ void md_do_sync(mddev_t *mddev)
6956 continue; 7206 continue;
6957 7207
6958 last_check = io_sectors; 7208 last_check = io_sectors;
6959
6960 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6961 break;
6962
6963 repeat: 7209 repeat:
6964 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7210 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6965 /* step marks */ 7211 /* step marks */
@@ -7077,29 +7323,23 @@ static int remove_and_add_spares(mddev_t *mddev)
7077 atomic_read(&rdev->nr_pending)==0) { 7323 atomic_read(&rdev->nr_pending)==0) {
7078 if (mddev->pers->hot_remove_disk( 7324 if (mddev->pers->hot_remove_disk(
7079 mddev, rdev->raid_disk)==0) { 7325 mddev, rdev->raid_disk)==0) {
7080 char nm[20]; 7326 sysfs_unlink_rdev(mddev, rdev);
7081 sprintf(nm,"rd%d", rdev->raid_disk);
7082 sysfs_remove_link(&mddev->kobj, nm);
7083 rdev->raid_disk = -1; 7327 rdev->raid_disk = -1;
7084 } 7328 }
7085 } 7329 }
7086 7330
7087 if (mddev->degraded && !mddev->recovery_disabled) { 7331 if (mddev->degraded) {
7088 list_for_each_entry(rdev, &mddev->disks, same_set) { 7332 list_for_each_entry(rdev, &mddev->disks, same_set) {
7089 if (rdev->raid_disk >= 0 && 7333 if (rdev->raid_disk >= 0 &&
7090 !test_bit(In_sync, &rdev->flags) && 7334 !test_bit(In_sync, &rdev->flags) &&
7091 !test_bit(Faulty, &rdev->flags) && 7335 !test_bit(Faulty, &rdev->flags))
7092 !test_bit(Blocked, &rdev->flags))
7093 spares++; 7336 spares++;
7094 if (rdev->raid_disk < 0 7337 if (rdev->raid_disk < 0
7095 && !test_bit(Faulty, &rdev->flags)) { 7338 && !test_bit(Faulty, &rdev->flags)) {
7096 rdev->recovery_offset = 0; 7339 rdev->recovery_offset = 0;
7097 if (mddev->pers-> 7340 if (mddev->pers->
7098 hot_add_disk(mddev, rdev) == 0) { 7341 hot_add_disk(mddev, rdev) == 0) {
7099 char nm[20]; 7342 if (sysfs_link_rdev(mddev, rdev))
7100 sprintf(nm, "rd%d", rdev->raid_disk);
7101 if (sysfs_create_link(&mddev->kobj,
7102 &rdev->kobj, nm))
7103 /* failure here is OK */; 7343 /* failure here is OK */;
7104 spares++; 7344 spares++;
7105 md_new_event(mddev); 7345 md_new_event(mddev);
@@ -7117,8 +7357,7 @@ static void reap_sync_thread(mddev_t *mddev)
7117 mdk_rdev_t *rdev; 7357 mdk_rdev_t *rdev;
7118 7358
7119 /* resync has finished, collect result */ 7359 /* resync has finished, collect result */
7120 md_unregister_thread(mddev->sync_thread); 7360 md_unregister_thread(&mddev->sync_thread);
7121 mddev->sync_thread = NULL;
7122 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7361 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7123 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7362 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7124 /* success...*/ 7363 /* success...*/
@@ -7148,6 +7387,8 @@ static void reap_sync_thread(mddev_t *mddev)
7148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7387 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7149 sysfs_notify_dirent_safe(mddev->sysfs_action); 7388 sysfs_notify_dirent_safe(mddev->sysfs_action);
7150 md_new_event(mddev); 7389 md_new_event(mddev);
7390 if (mddev->event_work.func)
7391 queue_work(md_misc_wq, &mddev->event_work);
7151} 7392}
7152 7393
7153/* 7394/*
@@ -7180,9 +7421,6 @@ void md_check_recovery(mddev_t *mddev)
7180 if (mddev->bitmap) 7421 if (mddev->bitmap)
7181 bitmap_daemon_work(mddev); 7422 bitmap_daemon_work(mddev);
7182 7423
7183 if (mddev->ro)
7184 return;
7185
7186 if (signal_pending(current)) { 7424 if (signal_pending(current)) {
7187 if (mddev->pers->sync_request && !mddev->external) { 7425 if (mddev->pers->sync_request && !mddev->external) {
7188 printk(KERN_INFO "md: %s in immediate safe mode\n", 7426 printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7219,9 +7457,7 @@ void md_check_recovery(mddev_t *mddev)
7219 atomic_read(&rdev->nr_pending)==0) { 7457 atomic_read(&rdev->nr_pending)==0) {
7220 if (mddev->pers->hot_remove_disk( 7458 if (mddev->pers->hot_remove_disk(
7221 mddev, rdev->raid_disk)==0) { 7459 mddev, rdev->raid_disk)==0) {
7222 char nm[20]; 7460 sysfs_unlink_rdev(mddev, rdev);
7223 sprintf(nm,"rd%d", rdev->raid_disk);
7224 sysfs_remove_link(&mddev->kobj, nm);
7225 rdev->raid_disk = -1; 7461 rdev->raid_disk = -1;
7226 } 7462 }
7227 } 7463 }
@@ -7341,12 +7577,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7341{ 7577{
7342 sysfs_notify_dirent_safe(rdev->sysfs_state); 7578 sysfs_notify_dirent_safe(rdev->sysfs_state);
7343 wait_event_timeout(rdev->blocked_wait, 7579 wait_event_timeout(rdev->blocked_wait,
7344 !test_bit(Blocked, &rdev->flags), 7580 !test_bit(Blocked, &rdev->flags) &&
7581 !test_bit(BlockedBadBlocks, &rdev->flags),
7345 msecs_to_jiffies(5000)); 7582 msecs_to_jiffies(5000));
7346 rdev_dec_pending(rdev, mddev); 7583 rdev_dec_pending(rdev, mddev);
7347} 7584}
7348EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7585EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7349 7586
7587
7588/* Bad block management.
7589 * We can record which blocks on each device are 'bad' and so just
7590 * fail those blocks, or that stripe, rather than the whole device.
7591 * Entries in the bad-block table are 64bits wide. This comprises:
7592 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7593 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7594 * A 'shift' can be set so that larger blocks are tracked and
7595 * consequently larger devices can be covered.
7596 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7597 *
7598 * Locking of the bad-block table uses a seqlock so md_is_badblock
7599 * might need to retry if it is very unlucky.
7600 * We will sometimes want to check for bad blocks in a bi_end_io function,
7601 * so we use the write_seqlock_irq variant.
7602 *
7603 * When looking for a bad block we specify a range and want to
7604 * know if any block in the range is bad. So we binary-search
7605 * to the last range that starts at-or-before the given endpoint,
7606 * (or "before the sector after the target range")
7607 * then see if it ends after the given start.
7608 * We return
7609 * 0 if there are no known bad blocks in the range
 7610	 *  1 if there are known bad blocks which are all acknowledged
7611 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7612 * plus the start/length of the first bad section we overlap.
7613 */
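As a worked example of the packing described above (assuming the BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK helpers pack the fields exactly as the comment says, with shift == 0), an acknowledged 8-sector bad range starting at sector 0x1000 becomes a single u64:

	u64 e = ((u64)1 << 63)		/* acknowledged bit */
	      | ((u64)0x1000 << 9)	/* start sector, 54 bits */
	      | (8 - 1);		/* length 1-512 stored as 0-511 */
	/* BB_OFFSET(e) == 0x1000, BB_LEN(e) == 8, BB_ACK(e) == 1 */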
7614int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7615 sector_t *first_bad, int *bad_sectors)
7616{
7617 int hi;
7618 int lo = 0;
7619 u64 *p = bb->page;
7620 int rv = 0;
7621 sector_t target = s + sectors;
7622 unsigned seq;
7623
7624 if (bb->shift > 0) {
7625 /* round the start down, and the end up */
7626 s >>= bb->shift;
7627 target += (1<<bb->shift) - 1;
7628 target >>= bb->shift;
7629 sectors = target - s;
7630 }
7631 /* 'target' is now the first block after the bad range */
7632
7633retry:
7634 seq = read_seqbegin(&bb->lock);
7635
7636 hi = bb->count;
7637
7638 /* Binary search between lo and hi for 'target'
7639 * i.e. for the last range that starts before 'target'
7640 */
7641 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7642 * are known not to be the last range before target.
7643 * VARIANT: hi-lo is the number of possible
7644 * ranges, and decreases until it reaches 1
7645 */
7646 while (hi - lo > 1) {
7647 int mid = (lo + hi) / 2;
7648 sector_t a = BB_OFFSET(p[mid]);
7649 if (a < target)
7650 /* This could still be the one, earlier ranges
7651 * could not. */
7652 lo = mid;
7653 else
7654 /* This and later ranges are definitely out. */
7655 hi = mid;
7656 }
7657 /* 'lo' might be the last that started before target, but 'hi' isn't */
7658 if (hi > lo) {
 7659	 	/* need to check all ranges that end after 's' to see if
7660 * any are unacknowledged.
7661 */
7662 while (lo >= 0 &&
7663 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7664 if (BB_OFFSET(p[lo]) < target) {
7665 /* starts before the end, and finishes after
7666 * the start, so they must overlap
7667 */
7668 if (rv != -1 && BB_ACK(p[lo]))
7669 rv = 1;
7670 else
7671 rv = -1;
7672 *first_bad = BB_OFFSET(p[lo]);
7673 *bad_sectors = BB_LEN(p[lo]);
7674 }
7675 lo--;
7676 }
7677 }
7678
7679 if (read_seqretry(&bb->lock, seq))
7680 goto retry;
7681
7682 return rv;
7683}
7684EXPORT_SYMBOL_GPL(md_is_badblock);
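A caller checking a range before issuing I/O might use the return convention roughly like this (rdev, this_sector and sectors are placeholders for whatever the personality already has in hand):

	sector_t first_bad;
	int bad_sectors;

	switch (md_is_badblock(&rdev->badblocks, this_sector, sectors,
			       &first_bad, &bad_sectors)) {
	case 0:		/* no known bad blocks: use the whole range */
		break;
	case 1:		/* acknowledged bad blocks overlap: the good prefix is
			 * first_bad - this_sector sectors (if any) */
		break;
	case -1:	/* unacknowledged bad blocks: don't trust the range until
			 * the metadata update has recorded them */
		break;
	}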
7685
7686/*
7687 * Add a range of bad blocks to the table.
7688 * This might extend the table, or might contract it
7689 * if two adjacent ranges can be merged.
7690 * We binary-search to find the 'insertion' point, then
7691 * decide how best to handle it.
7692 */
7693static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7694 int acknowledged)
7695{
7696 u64 *p;
7697 int lo, hi;
7698 int rv = 1;
7699
7700 if (bb->shift < 0)
7701 /* badblocks are disabled */
7702 return 0;
7703
7704 if (bb->shift) {
7705 /* round the start down, and the end up */
7706 sector_t next = s + sectors;
7707 s >>= bb->shift;
7708 next += (1<<bb->shift) - 1;
7709 next >>= bb->shift;
7710 sectors = next - s;
7711 }
7712
7713 write_seqlock_irq(&bb->lock);
7714
7715 p = bb->page;
7716 lo = 0;
7717 hi = bb->count;
7718 /* Find the last range that starts at-or-before 's' */
7719 while (hi - lo > 1) {
7720 int mid = (lo + hi) / 2;
7721 sector_t a = BB_OFFSET(p[mid]);
7722 if (a <= s)
7723 lo = mid;
7724 else
7725 hi = mid;
7726 }
7727 if (hi > lo && BB_OFFSET(p[lo]) > s)
7728 hi = lo;
7729
7730 if (hi > lo) {
7731 /* we found a range that might merge with the start
7732 * of our new range
7733 */
7734 sector_t a = BB_OFFSET(p[lo]);
7735 sector_t e = a + BB_LEN(p[lo]);
7736 int ack = BB_ACK(p[lo]);
7737 if (e >= s) {
7738 /* Yes, we can merge with a previous range */
7739 if (s == a && s + sectors >= e)
7740 /* new range covers old */
7741 ack = acknowledged;
7742 else
7743 ack = ack && acknowledged;
7744
7745 if (e < s + sectors)
7746 e = s + sectors;
7747 if (e - a <= BB_MAX_LEN) {
7748 p[lo] = BB_MAKE(a, e-a, ack);
7749 s = e;
7750 } else {
7751 /* does not all fit in one range,
7752 * make p[lo] maximal
7753 */
7754 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7755 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7756 s = a + BB_MAX_LEN;
7757 }
7758 sectors = e - s;
7759 }
7760 }
7761 if (sectors && hi < bb->count) {
7762 /* 'hi' points to the first range that starts after 's'.
7763 * Maybe we can merge with the start of that range */
7764 sector_t a = BB_OFFSET(p[hi]);
7765 sector_t e = a + BB_LEN(p[hi]);
7766 int ack = BB_ACK(p[hi]);
7767 if (a <= s + sectors) {
7768 /* merging is possible */
7769 if (e <= s + sectors) {
7770 /* full overlap */
7771 e = s + sectors;
7772 ack = acknowledged;
7773 } else
7774 ack = ack && acknowledged;
7775
7776 a = s;
7777 if (e - a <= BB_MAX_LEN) {
7778 p[hi] = BB_MAKE(a, e-a, ack);
7779 s = e;
7780 } else {
7781 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7782 s = a + BB_MAX_LEN;
7783 }
7784 sectors = e - s;
7785 lo = hi;
7786 hi++;
7787 }
7788 }
7789 if (sectors == 0 && hi < bb->count) {
7790 /* we might be able to combine lo and hi */
7791 /* Note: 's' is at the end of 'lo' */
7792 sector_t a = BB_OFFSET(p[hi]);
7793 int lolen = BB_LEN(p[lo]);
7794 int hilen = BB_LEN(p[hi]);
7795 int newlen = lolen + hilen - (s - a);
7796 if (s >= a && newlen < BB_MAX_LEN) {
7797 /* yes, we can combine them */
7798 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7799 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7800 memmove(p + hi, p + hi + 1,
7801 (bb->count - hi - 1) * 8);
7802 bb->count--;
7803 }
7804 }
7805 while (sectors) {
7806 /* didn't merge (it all).
7807 * Need to add a range just before 'hi' */
7808 if (bb->count >= MD_MAX_BADBLOCKS) {
7809 /* No room for more */
7810 rv = 0;
7811 break;
7812 } else {
7813 int this_sectors = sectors;
7814 memmove(p + hi + 1, p + hi,
7815 (bb->count - hi) * 8);
7816 bb->count++;
7817
7818 if (this_sectors > BB_MAX_LEN)
7819 this_sectors = BB_MAX_LEN;
7820 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7821 sectors -= this_sectors;
7822 s += this_sectors;
7823 }
7824 }
7825
7826 bb->changed = 1;
7827 if (!acknowledged)
7828 bb->unacked_exist = 1;
7829 write_sequnlock_irq(&bb->lock);
7830
7831 return rv;
7832}
7833
7834int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7835 int acknowledged)
7836{
7837 int rv = md_set_badblocks(&rdev->badblocks,
7838 s + rdev->data_offset, sectors, acknowledged);
7839 if (rv) {
7840 /* Make sure they get written out promptly */
7841 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7842 md_wakeup_thread(rdev->mddev->thread);
7843 }
7844 return rv;
7845}
7846EXPORT_SYMBOL_GPL(rdev_set_badblocks);
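Two details worth noting here. rdev_set_badblocks() records s + rdev->data_offset, so the table is indexed by whole-device sectors rather than array sectors. And the merge logic above lets an unacknowledged insert taint an existing entry: starting from an empty table, setting sectors 100-107 acknowledged and then 104-111 unacknowledged leaves one entry covering 100-111 that is not acknowledged, so it will keep showing up in unacknowledged_bad_blocks until the metadata handler acks it.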
7847
7848/*
7849 * Remove a range of bad blocks from the table.
 7850	 * This may involve extending the table if we split a region,
7851 * but it must not fail. So if the table becomes full, we just
7852 * drop the remove request.
7853 */
7854static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7855{
7856 u64 *p;
7857 int lo, hi;
7858 sector_t target = s + sectors;
7859 int rv = 0;
7860
7861 if (bb->shift > 0) {
7862 /* When clearing we round the start up and the end down.
7863 * This should not matter as the shift should align with
7864 * the block size and no rounding should ever be needed.
 7865	 * However it is better to think a block is bad when it
7866 * isn't than to think a block is not bad when it is.
7867 */
7868 s += (1<<bb->shift) - 1;
7869 s >>= bb->shift;
7870 target >>= bb->shift;
7871 sectors = target - s;
7872 }
7873
7874 write_seqlock_irq(&bb->lock);
7875
7876 p = bb->page;
7877 lo = 0;
7878 hi = bb->count;
7879 /* Find the last range that starts before 'target' */
7880 while (hi - lo > 1) {
7881 int mid = (lo + hi) / 2;
7882 sector_t a = BB_OFFSET(p[mid]);
7883 if (a < target)
7884 lo = mid;
7885 else
7886 hi = mid;
7887 }
7888 if (hi > lo) {
7889 /* p[lo] is the last range that could overlap the
7890 * current range. Earlier ranges could also overlap,
7891 * but only this one can overlap the end of the range.
7892 */
7893 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7894 /* Partial overlap, leave the tail of this range */
7895 int ack = BB_ACK(p[lo]);
7896 sector_t a = BB_OFFSET(p[lo]);
7897 sector_t end = a + BB_LEN(p[lo]);
7898
7899 if (a < s) {
7900 /* we need to split this range */
7901 if (bb->count >= MD_MAX_BADBLOCKS) {
7902 rv = 0;
7903 goto out;
7904 }
7905 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7906 bb->count++;
7907 p[lo] = BB_MAKE(a, s-a, ack);
7908 lo++;
7909 }
7910 p[lo] = BB_MAKE(target, end - target, ack);
7911 /* there is no longer an overlap */
7912 hi = lo;
7913 lo--;
7914 }
7915 while (lo >= 0 &&
7916 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7917 /* This range does overlap */
7918 if (BB_OFFSET(p[lo]) < s) {
7919 /* Keep the early parts of this range. */
7920 int ack = BB_ACK(p[lo]);
7921 sector_t start = BB_OFFSET(p[lo]);
7922 p[lo] = BB_MAKE(start, s - start, ack);
 7923	 			/* now p[lo] doesn't overlap, so we can stop */
7924 break;
7925 }
7926 lo--;
7927 }
7928 /* 'lo' is strictly before, 'hi' is strictly after,
7929 * anything between needs to be discarded
7930 */
7931 if (hi - lo > 1) {
7932 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7933 bb->count -= (hi - lo - 1);
7934 }
7935 }
7936
7937 bb->changed = 1;
7938out:
7939 write_sequnlock_irq(&bb->lock);
7940 return rv;
7941}
7942
7943int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7944{
7945 return md_clear_badblocks(&rdev->badblocks,
7946 s + rdev->data_offset,
7947 sectors);
7948}
7949EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
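Clearing can grow the table: removing the middle of a recorded range splits it in two. Continuing the example above, clearing sectors 104-107 from the entry covering 100-111 leaves two entries, 100-103 and 108-111, each keeping the original acknowledged state. If the table is already full, the split is silently dropped and the blocks stay marked bad, which is the safe direction to err in.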
7950
7951/*
7952 * Acknowledge all bad blocks in a list.
7953 * This only succeeds if ->changed is clear. It is used by
7954 * in-kernel metadata updates
7955 */
7956void md_ack_all_badblocks(struct badblocks *bb)
7957{
7958 if (bb->page == NULL || bb->changed)
7959 /* no point even trying */
7960 return;
7961 write_seqlock_irq(&bb->lock);
7962
7963 if (bb->changed == 0) {
7964 u64 *p = bb->page;
7965 int i;
7966 for (i = 0; i < bb->count ; i++) {
7967 if (!BB_ACK(p[i])) {
7968 sector_t start = BB_OFFSET(p[i]);
7969 int len = BB_LEN(p[i]);
7970 p[i] = BB_MAKE(start, len, 1);
7971 }
7972 }
7973 bb->unacked_exist = 0;
7974 }
7975 write_sequnlock_irq(&bb->lock);
7976}
7977EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7978
7979/* sysfs access to bad-blocks list.
7980 * We present two files.
7981 * 'bad-blocks' lists sector numbers and lengths of ranges that
7982 * are recorded as bad. The list is truncated to fit within
7983 * the one-page limit of sysfs.
7984 * Writing "sector length" to this file adds an acknowledged
 7985 * bad block to the list.
7986 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
7987 * been acknowledged. Writing to this file adds bad blocks
7988 * without acknowledging them. This is largely for testing.
7989 */
7990
7991static ssize_t
7992badblocks_show(struct badblocks *bb, char *page, int unack)
7993{
7994 size_t len;
7995 int i;
7996 u64 *p = bb->page;
7997 unsigned seq;
7998
7999 if (bb->shift < 0)
8000 return 0;
8001
8002retry:
8003 seq = read_seqbegin(&bb->lock);
8004
8005 len = 0;
8006 i = 0;
8007
8008 while (len < PAGE_SIZE && i < bb->count) {
8009 sector_t s = BB_OFFSET(p[i]);
8010 unsigned int length = BB_LEN(p[i]);
8011 int ack = BB_ACK(p[i]);
8012 i++;
8013
8014 if (unack && ack)
8015 continue;
8016
8017 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8018 (unsigned long long)s << bb->shift,
8019 length << bb->shift);
8020 }
8021 if (unack && len == 0)
8022 bb->unacked_exist = 0;
8023
8024 if (read_seqretry(&bb->lock, seq))
8025 goto retry;
8026
8027 return len;
8028}
8029
8030#define DO_DEBUG 1
8031
8032static ssize_t
8033badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8034{
8035 unsigned long long sector;
8036 int length;
8037 char newline;
8038#ifdef DO_DEBUG
8039 /* Allow clearing via sysfs *only* for testing/debugging.
8040 * Normally only a successful write may clear a badblock
8041 */
8042 int clear = 0;
8043 if (page[0] == '-') {
8044 clear = 1;
8045 page++;
8046 }
8047#endif /* DO_DEBUG */
8048
8049 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8050 case 3:
8051 if (newline != '\n')
8052 return -EINVAL;
8053 case 2:
8054 if (length <= 0)
8055 return -EINVAL;
8056 break;
8057 default:
8058 return -EINVAL;
8059 }
8060
8061#ifdef DO_DEBUG
8062 if (clear) {
8063 md_clear_badblocks(bb, sector, length);
8064 return len;
8065 }
8066#endif /* DO_DEBUG */
8067 if (md_set_badblocks(bb, sector, length, !unack))
8068 return len;
8069 else
8070 return -ENOSPC;
8071}
8072
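
For illustration only (not part of the patch): a hedged userspace sketch of driving the two sysfs files described above. The attribute path below is an assumption chosen for the example — the exact location depends on how the per-rdev directory is exposed for a given array — but the write format is the documented "sector length\n".

/* sketch: add and list bad blocks through a per-rdev sysfs attribute.
 * The path is assumed for illustration; adjust to your array/device.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/dev-sda/bad_blocks"; /* assumed path */
	FILE *f;
	char line[128];

	f = fopen(path, "w");
	if (f) {
		/* "sector length\n": mark 8 sectors starting at 2048 as bad */
		fprintf(f, "2048 8\n");
		fclose(f);
	}

	f = fopen(path, "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* each line: "<sector> <length>" */
		fclose(f);
	}
	return 0;
}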
7350static int md_notify_reboot(struct notifier_block *this, 8073static int md_notify_reboot(struct notifier_block *this,
7351 unsigned long code, void *x) 8074 unsigned long code, void *x)
7352{ 8075{
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1c26c7a08ae..0a309dc29b4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,13 @@
29typedef struct mddev_s mddev_t; 29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* Bad block numbers are stored sorted in a single page.
33 * 64bits is used for each block or extent.
34 * 54 bits are sector number, 9 bits are extent size,
35 * 1 bit is an 'acknowledged' flag.
36 */
37#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
38
32/* 39/*
33 * MD's 'extended' device 40 * MD's 'extended' device
34 */ 41 */
@@ -48,7 +55,7 @@ struct mdk_rdev_s
48 struct block_device *meta_bdev; 55 struct block_device *meta_bdev;
49 struct block_device *bdev; /* block device handle */ 56 struct block_device *bdev; /* block device handle */
50 57
51 struct page *sb_page; 58 struct page *sb_page, *bb_page;
52 int sb_loaded; 59 int sb_loaded;
53 __u64 sb_events; 60 __u64 sb_events;
54 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
@@ -74,9 +81,29 @@ struct mdk_rdev_s
74#define In_sync 2 /* device is in_sync with rest of array */ 81#define In_sync 2 /* device is in_sync with rest of array */
75#define WriteMostly 4 /* Avoid reading if at all possible */ 82#define WriteMostly 4 /* Avoid reading if at all possible */
76#define AutoDetected 7 /* added by auto-detect */ 83#define AutoDetected 7 /* added by auto-detect */
77#define Blocked 8 /* An error occurred on an externally 84#define Blocked 8 /* An error occurred but has not yet
78 * managed array, don't allow writes 85 * been acknowledged by the metadata
86 * handler, so don't allow writes
79 * until it is cleared */ 87 * until it is cleared */
88#define WriteErrorSeen 9 /* A write error has been seen on this
89 * device
90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
80 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
81 108
82 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -111,8 +138,54 @@ struct mdk_rdev_s
111 138
112 struct sysfs_dirent *sysfs_state; /* handle for 'state' 139 struct sysfs_dirent *sysfs_state; /* handle for 'state'
113 * sysfs entry */ 140 * sysfs entry */
141
142 struct badblocks {
143 int count; /* count of bad blocks */
144 int unacked_exist; /* there probably are unacknowledged
145 * bad blocks. This is only cleared
146 * when a read discovers none
147 */
148 int shift; /* shift from sectors to block size
149 * a -ve shift means badblocks are
150 * disabled.*/
151 u64 *page; /* badblock list */
152 int changed;
153 seqlock_t lock;
154
155 sector_t sector;
156 sector_t size; /* in sectors */
157 } badblocks;
114}; 158};
115 159
160#define BB_LEN_MASK (0x00000000000001FFULL)
161#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
162#define BB_ACK_MASK (0x8000000000000000ULL)
163#define BB_MAX_LEN 512
164#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
165#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
166#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
167#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
168
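
For illustration only (not part of the patch): a standalone round-trip through the 64-bit encoding defined by the BB_* macros above — 54 bits of sector offset, 9 bits of (length - 1), and one acknowledged bit. The macros are copied from md.h; only main() is added.

/* sketch: encode one badblocks entry and decode it again */
#include <stdio.h>
#include <stdint.h>

#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((uint64_t)(!!(ack)) << 63))

int main(void)
{
	uint64_t e = BB_MAKE(123456ULL, 8, 1);	/* 8 bad sectors at 123456, acked */

	printf("offset=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned long long)BB_LEN(e),
	       BB_ACK(e));
	return 0;
}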
169extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
170 sector_t *first_bad, int *bad_sectors);
171static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
172 sector_t *first_bad, int *bad_sectors)
173{
174 if (unlikely(rdev->badblocks.count)) {
175 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
176 sectors,
177 first_bad, bad_sectors);
178 if (rv)
179 *first_bad -= rdev->data_offset;
180 return rv;
181 }
182 return 0;
183}
184extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
185 int acknowledged);
186extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
187extern void md_ack_all_badblocks(struct badblocks *bb);
188
116struct mddev_s 189struct mddev_s
117{ 190{
118 void *private; 191 void *private;
@@ -239,9 +312,12 @@ struct mddev_s
239#define MD_RECOVERY_FROZEN 9 312#define MD_RECOVERY_FROZEN 9
240 313
241 unsigned long recovery; 314 unsigned long recovery;
242 int recovery_disabled; /* if we detect that recovery 315 /* If a RAID personality determines that recovery (of a particular
243 * will always fail, set this 316 * device) will fail due to a read error on the source device, it
244 * so we don't loop trying */ 317 * takes a copy of this number and does not attempt recovery again
318 * until this number changes.
319 */
320 int recovery_disabled;
245 321
246 int in_sync; /* know to not need resync */ 322 int in_sync; /* know to not need resync */
247 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so 323 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
@@ -304,11 +380,6 @@ struct mddev_s
304 * hot-adding a bitmap. It should 380 * hot-adding a bitmap. It should
305 * eventually be settable by sysfs. 381 * eventually be settable by sysfs.
306 */ 382 */
307 /* When md is serving under dm, it might use a
308 * dirty_log to store the bits.
309 */
310 struct dm_dirty_log *log;
311
312 struct mutex mutex; 383 struct mutex mutex;
313 unsigned long chunksize; 384 unsigned long chunksize;
314 unsigned long daemon_sleep; /* how many jiffies between updates? */ 385 unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
413 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 484 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
414} 485}
415 486
487static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
488{
489 char nm[20];
490 sprintf(nm, "rd%d", rdev->raid_disk);
491 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
492}
493
494static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
495{
496 char nm[20];
497 sprintf(nm, "rd%d", rdev->raid_disk);
498 sysfs_remove_link(&mddev->kobj, nm);
499}
500
416/* 501/*
417 * iterates through some rdev ringlist. It's safe to remove the 502 * iterates through some rdev ringlist. It's safe to remove the
418 * current 'rdev'. Dont touch 'tmp' though. 503 * current 'rdev'. Dont touch 'tmp' though.
@@ -475,7 +560,7 @@ extern int register_md_personality(struct mdk_personality *p);
475extern int unregister_md_personality(struct mdk_personality *p); 560extern int unregister_md_personality(struct mdk_personality *p);
476extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), 561extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
477 mddev_t *mddev, const char *name); 562 mddev_t *mddev, const char *name);
478extern void md_unregister_thread(mdk_thread_t *thread); 563extern void md_unregister_thread(mdk_thread_t **threadp);
479extern void md_wakeup_thread(mdk_thread_t *thread); 564extern void md_wakeup_thread(mdk_thread_t *thread);
480extern void md_check_recovery(mddev_t *mddev); 565extern void md_check_recovery(mddev_t *mddev);
481extern void md_write_start(mddev_t *mddev, struct bio *bi); 566extern void md_write_start(mddev_t *mddev, struct bio *bi);
@@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
505extern int md_run(mddev_t *mddev); 590extern int md_run(mddev_t *mddev);
506extern void md_stop(mddev_t *mddev); 591extern void md_stop(mddev_t *mddev);
507extern void md_stop_writes(mddev_t *mddev); 592extern void md_stop_writes(mddev_t *mddev);
508extern void md_rdev_init(mdk_rdev_t *rdev); 593extern int md_rdev_init(mdk_rdev_t *rdev);
509 594
510extern void mddev_suspend(mddev_t *mddev); 595extern void mddev_suspend(mddev_t *mddev);
511extern void mddev_resume(mddev_t *mddev); 596extern void mddev_resume(mddev_t *mddev);
@@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
514extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 599extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
515 mddev_t *mddev); 600 mddev_t *mddev);
516extern int mddev_check_plugged(mddev_t *mddev); 601extern int mddev_check_plugged(mddev_t *mddev);
602extern void md_trim_bio(struct bio *bio, int offset, int size);
517#endif /* _MD_MD_H */ 603#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23af28..d5b5fb30017 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -514,8 +514,7 @@ static int multipath_stop (mddev_t *mddev)
514{ 514{
515 multipath_conf_t *conf = mddev->private; 515 multipath_conf_t *conf = mddev->private;
516 516
517 md_unregister_thread(mddev->thread); 517 md_unregister_thread(&mddev->thread);
518 mddev->thread = NULL;
519 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 518 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
520 mempool_destroy(conf->pool); 519 mempool_destroy(conf->pool);
521 kfree(conf->multipaths); 520 kfree(conf->multipaths);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f7431b6d844..606fc04fd76 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -35,16 +35,13 @@
35#include <linux/delay.h> 35#include <linux/delay.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/seq_file.h> 37#include <linux/seq_file.h>
38#include <linux/ratelimit.h>
38#include "md.h" 39#include "md.h"
39#include "raid1.h" 40#include "raid1.h"
40#include "bitmap.h" 41#include "bitmap.h"
41 42
42#define DEBUG 0 43#define DEBUG 0
43#if DEBUG 44#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
44#define PRINTK(x...) printk(x)
45#else
46#define PRINTK(x...)
47#endif
48 45
49/* 46/*
50 * Number of guaranteed r1bios in case of extreme VM load: 47 * Number of guaranteed r1bios in case of extreme VM load:
@@ -166,7 +163,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
166 163
167 for (i = 0; i < conf->raid_disks; i++) { 164 for (i = 0; i < conf->raid_disks; i++) {
168 struct bio **bio = r1_bio->bios + i; 165 struct bio **bio = r1_bio->bios + i;
169 if (*bio && *bio != IO_BLOCKED) 166 if (!BIO_SPECIAL(*bio))
170 bio_put(*bio); 167 bio_put(*bio);
171 *bio = NULL; 168 *bio = NULL;
172 } 169 }
@@ -176,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
176{ 173{
177 conf_t *conf = r1_bio->mddev->private; 174 conf_t *conf = r1_bio->mddev->private;
178 175
179 /*
180 * Wake up any possible resync thread that waits for the device
181 * to go idle.
182 */
183 allow_barrier(conf);
184
185 put_all_bios(conf, r1_bio); 176 put_all_bios(conf, r1_bio);
186 mempool_free(r1_bio, conf->r1bio_pool); 177 mempool_free(r1_bio, conf->r1bio_pool);
187} 178}
@@ -222,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
222 * operation and are ready to return a success/failure code to the buffer 213 * operation and are ready to return a success/failure code to the buffer
223 * cache layer. 214 * cache layer.
224 */ 215 */
216static void call_bio_endio(r1bio_t *r1_bio)
217{
218 struct bio *bio = r1_bio->master_bio;
219 int done;
220 conf_t *conf = r1_bio->mddev->private;
221
222 if (bio->bi_phys_segments) {
223 unsigned long flags;
224 spin_lock_irqsave(&conf->device_lock, flags);
225 bio->bi_phys_segments--;
226 done = (bio->bi_phys_segments == 0);
227 spin_unlock_irqrestore(&conf->device_lock, flags);
228 } else
229 done = 1;
230
231 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
232 clear_bit(BIO_UPTODATE, &bio->bi_flags);
233 if (done) {
234 bio_endio(bio, 0);
235 /*
236 * Wake up any possible resync thread that waits for the device
237 * to go idle.
238 */
239 allow_barrier(conf);
240 }
241}
242
225static void raid_end_bio_io(r1bio_t *r1_bio) 243static void raid_end_bio_io(r1bio_t *r1_bio)
226{ 244{
227 struct bio *bio = r1_bio->master_bio; 245 struct bio *bio = r1_bio->master_bio;
@@ -234,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
234 (unsigned long long) bio->bi_sector + 252 (unsigned long long) bio->bi_sector +
235 (bio->bi_size >> 9) - 1); 253 (bio->bi_size >> 9) - 1);
236 254
237 bio_endio(bio, 255 call_bio_endio(r1_bio);
238 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
239 } 256 }
240 free_r1bio(r1_bio); 257 free_r1bio(r1_bio);
241} 258}
@@ -287,36 +304,52 @@ static void raid1_end_read_request(struct bio *bio, int error)
287 * oops, read error: 304 * oops, read error:
288 */ 305 */
289 char b[BDEVNAME_SIZE]; 306 char b[BDEVNAME_SIZE];
290 if (printk_ratelimit()) 307 printk_ratelimited(
291 printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", 308 KERN_ERR "md/raid1:%s: %s: "
292 mdname(conf->mddev), 309 "rescheduling sector %llu\n",
293 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 310 mdname(conf->mddev),
311 bdevname(conf->mirrors[mirror].rdev->bdev,
312 b),
313 (unsigned long long)r1_bio->sector);
314 set_bit(R1BIO_ReadError, &r1_bio->state);
294 reschedule_retry(r1_bio); 315 reschedule_retry(r1_bio);
295 } 316 }
296 317
297 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 318 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
298} 319}
299 320
321static void close_write(r1bio_t *r1_bio)
322{
323 /* it really is the end of this request */
324 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
325 /* free extra copy of the data pages */
326 int i = r1_bio->behind_page_count;
327 while (i--)
328 safe_put_page(r1_bio->behind_bvecs[i].bv_page);
329 kfree(r1_bio->behind_bvecs);
330 r1_bio->behind_bvecs = NULL;
331 }
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state),
336 test_bit(R1BIO_BehindIO, &r1_bio->state));
337 md_write_end(r1_bio->mddev);
338}
339
300static void r1_bio_write_done(r1bio_t *r1_bio) 340static void r1_bio_write_done(r1bio_t *r1_bio)
301{ 341{
302 if (atomic_dec_and_test(&r1_bio->remaining)) 342 if (!atomic_dec_and_test(&r1_bio->remaining))
303 { 343 return;
304 /* it really is the end of this request */ 344
305 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 345 if (test_bit(R1BIO_WriteError, &r1_bio->state))
306 /* free extra copy of the data pages */ 346 reschedule_retry(r1_bio);
307 int i = r1_bio->behind_page_count; 347 else {
308 while (i--) 348 close_write(r1_bio);
309 safe_put_page(r1_bio->behind_pages[i]); 349 if (test_bit(R1BIO_MadeGood, &r1_bio->state))
310 kfree(r1_bio->behind_pages); 350 reschedule_retry(r1_bio);
311 r1_bio->behind_pages = NULL; 351 else
312 } 352 raid_end_bio_io(r1_bio);
313 /* clear the bitmap if all writes complete successfully */
314 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
315 r1_bio->sectors,
316 !test_bit(R1BIO_Degraded, &r1_bio->state),
317 test_bit(R1BIO_BehindIO, &r1_bio->state));
318 md_write_end(r1_bio->mddev);
319 raid_end_bio_io(r1_bio);
320 } 353 }
321} 354}
322 355
@@ -336,13 +369,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
336 /* 369 /*
337 * 'one mirror IO has finished' event handler: 370 * 'one mirror IO has finished' event handler:
338 */ 371 */
339 r1_bio->bios[mirror] = NULL;
340 to_put = bio;
341 if (!uptodate) { 372 if (!uptodate) {
342 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 373 set_bit(WriteErrorSeen,
343 /* an I/O failed, we can't clear the bitmap */ 374 &conf->mirrors[mirror].rdev->flags);
344 set_bit(R1BIO_Degraded, &r1_bio->state); 375 set_bit(R1BIO_WriteError, &r1_bio->state);
345 } else 376 } else {
346 /* 377 /*
347 * Set R1BIO_Uptodate in our master bio, so that we 378 * Set R1BIO_Uptodate in our master bio, so that we
348 * will return a good error code for to the higher 379 * will return a good error code for to the higher
@@ -353,8 +384,22 @@ static void raid1_end_write_request(struct bio *bio, int error)
353 * to user-side. So if something waits for IO, then it 384 * to user-side. So if something waits for IO, then it
354 * will wait for the 'master' bio. 385 * will wait for the 'master' bio.
355 */ 386 */
387 sector_t first_bad;
388 int bad_sectors;
389
390 r1_bio->bios[mirror] = NULL;
391 to_put = bio;
356 set_bit(R1BIO_Uptodate, &r1_bio->state); 392 set_bit(R1BIO_Uptodate, &r1_bio->state);
357 393
394 /* Maybe we can clear some bad blocks. */
395 if (is_badblock(conf->mirrors[mirror].rdev,
396 r1_bio->sector, r1_bio->sectors,
397 &first_bad, &bad_sectors)) {
398 r1_bio->bios[mirror] = IO_MADE_GOOD;
399 set_bit(R1BIO_MadeGood, &r1_bio->state);
400 }
401 }
402
358 update_head_pos(mirror, r1_bio); 403 update_head_pos(mirror, r1_bio);
359 404
360 if (behind) { 405 if (behind) {
@@ -377,11 +422,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
377 (unsigned long long) mbio->bi_sector, 422 (unsigned long long) mbio->bi_sector,
378 (unsigned long long) mbio->bi_sector + 423 (unsigned long long) mbio->bi_sector +
379 (mbio->bi_size >> 9) - 1); 424 (mbio->bi_size >> 9) - 1);
380 bio_endio(mbio, 0); 425 call_bio_endio(r1_bio);
381 } 426 }
382 } 427 }
383 } 428 }
384 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 429 if (r1_bio->bios[mirror] == NULL)
430 rdev_dec_pending(conf->mirrors[mirror].rdev,
431 conf->mddev);
385 432
386 /* 433 /*
387 * Let's see if all mirrored write operations have finished 434 * Let's see if all mirrored write operations have finished
@@ -408,10 +455,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
408 * 455 *
409 * The rdev for the device selected will have nr_pending incremented. 456 * The rdev for the device selected will have nr_pending incremented.
410 */ 457 */
411static int read_balance(conf_t *conf, r1bio_t *r1_bio) 458static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
412{ 459{
413 const sector_t this_sector = r1_bio->sector; 460 const sector_t this_sector = r1_bio->sector;
414 const int sectors = r1_bio->sectors; 461 int sectors;
462 int best_good_sectors;
415 int start_disk; 463 int start_disk;
416 int best_disk; 464 int best_disk;
417 int i; 465 int i;
@@ -426,8 +474,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
426 * We take the first readable disk when above the resync window. 474 * We take the first readable disk when above the resync window.
427 */ 475 */
428 retry: 476 retry:
477 sectors = r1_bio->sectors;
429 best_disk = -1; 478 best_disk = -1;
430 best_dist = MaxSector; 479 best_dist = MaxSector;
480 best_good_sectors = 0;
481
431 if (conf->mddev->recovery_cp < MaxSector && 482 if (conf->mddev->recovery_cp < MaxSector &&
432 (this_sector + sectors >= conf->next_resync)) { 483 (this_sector + sectors >= conf->next_resync)) {
433 choose_first = 1; 484 choose_first = 1;
@@ -439,6 +490,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
439 490
440 for (i = 0 ; i < conf->raid_disks ; i++) { 491 for (i = 0 ; i < conf->raid_disks ; i++) {
441 sector_t dist; 492 sector_t dist;
493 sector_t first_bad;
494 int bad_sectors;
495
442 int disk = start_disk + i; 496 int disk = start_disk + i;
443 if (disk >= conf->raid_disks) 497 if (disk >= conf->raid_disks)
444 disk -= conf->raid_disks; 498 disk -= conf->raid_disks;
@@ -454,13 +508,51 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
454 if (test_bit(WriteMostly, &rdev->flags)) { 508 if (test_bit(WriteMostly, &rdev->flags)) {
455 /* Don't balance among write-mostly, just 509 /* Don't balance among write-mostly, just
456 * use the first as a last resort */ 510 * use the first as a last resort */
457 if (best_disk < 0) 511 if (best_disk < 0) {
512 if (is_badblock(rdev, this_sector, sectors,
513 &first_bad, &bad_sectors)) {
514 if (first_bad < this_sector)
515 /* Cannot use this */
516 continue;
517 best_good_sectors = first_bad - this_sector;
518 } else
519 best_good_sectors = sectors;
458 best_disk = disk; 520 best_disk = disk;
521 }
459 continue; 522 continue;
460 } 523 }
461 /* This is a reasonable device to use. It might 524 /* This is a reasonable device to use. It might
462 * even be best. 525 * even be best.
463 */ 526 */
527 if (is_badblock(rdev, this_sector, sectors,
528 &first_bad, &bad_sectors)) {
529 if (best_dist < MaxSector)
530 /* already have a better device */
531 continue;
532 if (first_bad <= this_sector) {
533 /* cannot read here. If this is the 'primary'
534 * device, then we must not read beyond
535 * bad_sectors from another device..
536 */
537 bad_sectors -= (this_sector - first_bad);
538 if (choose_first && sectors > bad_sectors)
539 sectors = bad_sectors;
540 if (best_good_sectors > sectors)
541 best_good_sectors = sectors;
542
543 } else {
544 sector_t good_sectors = first_bad - this_sector;
545 if (good_sectors > best_good_sectors) {
546 best_good_sectors = good_sectors;
547 best_disk = disk;
548 }
549 if (choose_first)
550 break;
551 }
552 continue;
553 } else
554 best_good_sectors = sectors;
555
464 dist = abs(this_sector - conf->mirrors[disk].head_position); 556 dist = abs(this_sector - conf->mirrors[disk].head_position);
465 if (choose_first 557 if (choose_first
466 /* Don't change to another disk for sequential reads */ 558 /* Don't change to another disk for sequential reads */
@@ -489,10 +581,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
489 rdev_dec_pending(rdev, conf->mddev); 581 rdev_dec_pending(rdev, conf->mddev);
490 goto retry; 582 goto retry;
491 } 583 }
584 sectors = best_good_sectors;
492 conf->next_seq_sect = this_sector + sectors; 585 conf->next_seq_sect = this_sector + sectors;
493 conf->last_used = best_disk; 586 conf->last_used = best_disk;
494 } 587 }
495 rcu_read_unlock(); 588 rcu_read_unlock();
589 *max_sectors = sectors;
496 590
497 return best_disk; 591 return best_disk;
498} 592}
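
For illustration only (not part of the patch): the read_balance() change above truncates a read at the first bad sector on the chosen device. A simplified standalone sketch of that calculation follows; the real function also balances head distance and handles the write-mostly and 'choose_first' cases.

/* sketch: how many sectors can be read from 'this_sector' before the
 * first bad block on a device, given an is_badblock()-style answer.
 * Returns 0 when the requested start itself falls inside a bad range.
 */
typedef unsigned long long sector_t;

static int good_sectors_before_bad(sector_t this_sector, int sectors,
                                   int is_bad, sector_t first_bad)
{
	if (!is_bad)
		return sectors;			/* whole range is clean */
	if (first_bad <= this_sector)
		return 0;			/* cannot start reading here */
	if (first_bad - this_sector < (sector_t)sectors)
		return (int)(first_bad - this_sector);
	return sectors;
}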
@@ -672,30 +766,31 @@ static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
672{ 766{
673 int i; 767 int i;
674 struct bio_vec *bvec; 768 struct bio_vec *bvec;
675 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), 769 struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
676 GFP_NOIO); 770 GFP_NOIO);
677 if (unlikely(!pages)) 771 if (unlikely(!bvecs))
678 return; 772 return;
679 773
680 bio_for_each_segment(bvec, bio, i) { 774 bio_for_each_segment(bvec, bio, i) {
681 pages[i] = alloc_page(GFP_NOIO); 775 bvecs[i] = *bvec;
682 if (unlikely(!pages[i])) 776 bvecs[i].bv_page = alloc_page(GFP_NOIO);
777 if (unlikely(!bvecs[i].bv_page))
683 goto do_sync_io; 778 goto do_sync_io;
684 memcpy(kmap(pages[i]) + bvec->bv_offset, 779 memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
685 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 780 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
686 kunmap(pages[i]); 781 kunmap(bvecs[i].bv_page);
687 kunmap(bvec->bv_page); 782 kunmap(bvec->bv_page);
688 } 783 }
689 r1_bio->behind_pages = pages; 784 r1_bio->behind_bvecs = bvecs;
690 r1_bio->behind_page_count = bio->bi_vcnt; 785 r1_bio->behind_page_count = bio->bi_vcnt;
691 set_bit(R1BIO_BehindIO, &r1_bio->state); 786 set_bit(R1BIO_BehindIO, &r1_bio->state);
692 return; 787 return;
693 788
694do_sync_io: 789do_sync_io:
695 for (i = 0; i < bio->bi_vcnt; i++) 790 for (i = 0; i < bio->bi_vcnt; i++)
696 if (pages[i]) 791 if (bvecs[i].bv_page)
697 put_page(pages[i]); 792 put_page(bvecs[i].bv_page);
698 kfree(pages); 793 kfree(bvecs);
699 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 794 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
700} 795}
701 796
@@ -705,7 +800,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
705 mirror_info_t *mirror; 800 mirror_info_t *mirror;
706 r1bio_t *r1_bio; 801 r1bio_t *r1_bio;
707 struct bio *read_bio; 802 struct bio *read_bio;
708 int i, targets = 0, disks; 803 int i, disks;
709 struct bitmap *bitmap; 804 struct bitmap *bitmap;
710 unsigned long flags; 805 unsigned long flags;
711 const int rw = bio_data_dir(bio); 806 const int rw = bio_data_dir(bio);
@@ -713,6 +808,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
713 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 808 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
714 mdk_rdev_t *blocked_rdev; 809 mdk_rdev_t *blocked_rdev;
715 int plugged; 810 int plugged;
811 int first_clone;
812 int sectors_handled;
813 int max_sectors;
716 814
717 /* 815 /*
718 * Register the new request and wait if the reconstruction 816 * Register the new request and wait if the reconstruction
@@ -759,11 +857,24 @@ static int make_request(mddev_t *mddev, struct bio * bio)
759 r1_bio->mddev = mddev; 857 r1_bio->mddev = mddev;
760 r1_bio->sector = bio->bi_sector; 858 r1_bio->sector = bio->bi_sector;
761 859
860 /* We might need to issue multiple reads to different
861 * devices if there are bad blocks around, so we keep
862 * track of the number of reads in bio->bi_phys_segments.
863 * If this is 0, there is only one r1_bio and no locking
864 * will be needed when requests complete. If it is
865 * non-zero, then it is the number of not-completed requests.
866 */
867 bio->bi_phys_segments = 0;
868 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
869
762 if (rw == READ) { 870 if (rw == READ) {
763 /* 871 /*
764 * read balancing logic: 872 * read balancing logic:
765 */ 873 */
766 int rdisk = read_balance(conf, r1_bio); 874 int rdisk;
875
876read_again:
877 rdisk = read_balance(conf, r1_bio, &max_sectors);
767 878
768 if (rdisk < 0) { 879 if (rdisk < 0) {
769 /* couldn't find anywhere to read from */ 880 /* couldn't find anywhere to read from */
@@ -784,6 +895,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
784 r1_bio->read_disk = rdisk; 895 r1_bio->read_disk = rdisk;
785 896
786 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 897 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
898 md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
899 max_sectors);
787 900
788 r1_bio->bios[rdisk] = read_bio; 901 r1_bio->bios[rdisk] = read_bio;
789 902
@@ -793,16 +906,52 @@ static int make_request(mddev_t *mddev, struct bio * bio)
793 read_bio->bi_rw = READ | do_sync; 906 read_bio->bi_rw = READ | do_sync;
794 read_bio->bi_private = r1_bio; 907 read_bio->bi_private = r1_bio;
795 908
796 generic_make_request(read_bio); 909 if (max_sectors < r1_bio->sectors) {
910 /* could not read all from this device, so we will
911 * need another r1_bio.
912 */
913
914 sectors_handled = (r1_bio->sector + max_sectors
915 - bio->bi_sector);
916 r1_bio->sectors = max_sectors;
917 spin_lock_irq(&conf->device_lock);
918 if (bio->bi_phys_segments == 0)
919 bio->bi_phys_segments = 2;
920 else
921 bio->bi_phys_segments++;
922 spin_unlock_irq(&conf->device_lock);
923 /* Cannot call generic_make_request directly
924 * as that will be queued in __make_request
925 * and subsequent mempool_alloc might block waiting
926 * for it. So hand bio over to raid1d.
927 */
928 reschedule_retry(r1_bio);
929
930 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
931
932 r1_bio->master_bio = bio;
933 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
934 r1_bio->state = 0;
935 r1_bio->mddev = mddev;
936 r1_bio->sector = bio->bi_sector + sectors_handled;
937 goto read_again;
938 } else
939 generic_make_request(read_bio);
797 return 0; 940 return 0;
798 } 941 }
799 942
800 /* 943 /*
801 * WRITE: 944 * WRITE:
802 */ 945 */
803 /* first select target devices under spinlock and 946 /* first select target devices under rcu_lock and
804 * inc refcount on their rdev. Record them by setting 947 * inc refcount on their rdev. Record them by setting
805 * bios[x] to bio 948 * bios[x] to bio
949 * If there are known/acknowledged bad blocks on any device on
950 * which we have seen a write error, we want to avoid writing those
951 * blocks.
952 * This potentially requires several writes to write around
 953 * the bad blocks. Each set of writes gets its own r1bio
954 * with a set of bios attached.
806 */ 955 */
807 plugged = mddev_check_plugged(mddev); 956 plugged = mddev_check_plugged(mddev);
808 957
@@ -810,6 +959,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
810 retry_write: 959 retry_write:
811 blocked_rdev = NULL; 960 blocked_rdev = NULL;
812 rcu_read_lock(); 961 rcu_read_lock();
962 max_sectors = r1_bio->sectors;
813 for (i = 0; i < disks; i++) { 963 for (i = 0; i < disks; i++) {
814 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 964 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
815 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 965 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -817,17 +967,56 @@ static int make_request(mddev_t *mddev, struct bio * bio)
817 blocked_rdev = rdev; 967 blocked_rdev = rdev;
818 break; 968 break;
819 } 969 }
820 if (rdev && !test_bit(Faulty, &rdev->flags)) { 970 r1_bio->bios[i] = NULL;
821 atomic_inc(&rdev->nr_pending); 971 if (!rdev || test_bit(Faulty, &rdev->flags)) {
822 if (test_bit(Faulty, &rdev->flags)) { 972 set_bit(R1BIO_Degraded, &r1_bio->state);
973 continue;
974 }
975
976 atomic_inc(&rdev->nr_pending);
977 if (test_bit(WriteErrorSeen, &rdev->flags)) {
978 sector_t first_bad;
979 int bad_sectors;
980 int is_bad;
981
982 is_bad = is_badblock(rdev, r1_bio->sector,
983 max_sectors,
984 &first_bad, &bad_sectors);
985 if (is_bad < 0) {
986 /* mustn't write here until the bad block is
987 * acknowledged*/
988 set_bit(BlockedBadBlocks, &rdev->flags);
989 blocked_rdev = rdev;
990 break;
991 }
992 if (is_bad && first_bad <= r1_bio->sector) {
993 /* Cannot write here at all */
994 bad_sectors -= (r1_bio->sector - first_bad);
995 if (bad_sectors < max_sectors)
996 /* mustn't write more than bad_sectors
997 * to other devices yet
998 */
999 max_sectors = bad_sectors;
823 rdev_dec_pending(rdev, mddev); 1000 rdev_dec_pending(rdev, mddev);
824 r1_bio->bios[i] = NULL; 1001 /* We don't set R1BIO_Degraded as that
825 } else { 1002 * only applies if the disk is
826 r1_bio->bios[i] = bio; 1003 * missing, so it might be re-added,
827 targets++; 1004 * and we want to know to recover this
1005 * chunk.
1006 * In this case the device is here,
1007 * and the fact that this chunk is not
1008 * in-sync is recorded in the bad
1009 * block log
1010 */
1011 continue;
828 } 1012 }
829 } else 1013 if (is_bad) {
830 r1_bio->bios[i] = NULL; 1014 int good_sectors = first_bad - r1_bio->sector;
1015 if (good_sectors < max_sectors)
1016 max_sectors = good_sectors;
1017 }
1018 }
1019 r1_bio->bios[i] = bio;
831 } 1020 }
832 rcu_read_unlock(); 1021 rcu_read_unlock();
833 1022
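
For illustration only (not part of the patch): a compressed sketch of the per-device decision the write path now makes when WriteErrorSeen is set. It is simplified pseudologic in plain C, not the kernel implementation; the enum and function name are invented for the example.

/* sketch: classify one member device for a write starting at 'start'.
 * is_bad < 0 stands for an unacknowledged bad block (writer must wait),
 * is_bad > 0 for an acknowledged one reported at first_bad/bad_sectors.
 */
enum wdecision { WRITE_FULL, WRITE_TRIMMED, SKIP_DEVICE, MUST_WAIT };

typedef unsigned long long sector_t;

static enum wdecision classify_write(int is_bad, sector_t first_bad,
                                     int bad_sectors, sector_t start,
                                     int *max_sectors)
{
	if (is_bad < 0)
		return MUST_WAIT;	/* unacknowledged bad block: block the writer */
	if (is_bad && first_bad <= start) {
		/* bad block covers the start: skip this device, and keep the
		 * other devices from writing past the bad range for now */
		int remaining = bad_sectors - (int)(start - first_bad);
		if (remaining < *max_sectors)
			*max_sectors = remaining;
		return SKIP_DEVICE;
	}
	if (is_bad) {
		int good = (int)(first_bad - start);	/* write only up to the bad block */
		if (good < *max_sectors)
			*max_sectors = good;
		return WRITE_TRIMMED;
	}
	return WRITE_FULL;
}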
@@ -838,51 +1027,57 @@ static int make_request(mddev_t *mddev, struct bio * bio)
838 for (j = 0; j < i; j++) 1027 for (j = 0; j < i; j++)
839 if (r1_bio->bios[j]) 1028 if (r1_bio->bios[j])
840 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1029 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
841 1030 r1_bio->state = 0;
842 allow_barrier(conf); 1031 allow_barrier(conf);
843 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1032 md_wait_for_blocked_rdev(blocked_rdev, mddev);
844 wait_barrier(conf); 1033 wait_barrier(conf);
845 goto retry_write; 1034 goto retry_write;
846 } 1035 }
847 1036
848 BUG_ON(targets == 0); /* we never fail the last device */ 1037 if (max_sectors < r1_bio->sectors) {
849 1038 /* We are splitting this write into multiple parts, so
850 if (targets < conf->raid_disks) { 1039 * we need to prepare for allocating another r1_bio.
851 /* array is degraded, we will not clear the bitmap 1040 */
852 * on I/O completion (see raid1_end_write_request) */ 1041 r1_bio->sectors = max_sectors;
853 set_bit(R1BIO_Degraded, &r1_bio->state); 1042 spin_lock_irq(&conf->device_lock);
1043 if (bio->bi_phys_segments == 0)
1044 bio->bi_phys_segments = 2;
1045 else
1046 bio->bi_phys_segments++;
1047 spin_unlock_irq(&conf->device_lock);
854 } 1048 }
855 1049 sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
856 /* do behind I/O ?
857 * Not if there are too many, or cannot allocate memory,
858 * or a reader on WriteMostly is waiting for behind writes
859 * to flush */
860 if (bitmap &&
861 (atomic_read(&bitmap->behind_writes)
862 < mddev->bitmap_info.max_write_behind) &&
863 !waitqueue_active(&bitmap->behind_wait))
864 alloc_behind_pages(bio, r1_bio);
865 1050
866 atomic_set(&r1_bio->remaining, 1); 1051 atomic_set(&r1_bio->remaining, 1);
867 atomic_set(&r1_bio->behind_remaining, 0); 1052 atomic_set(&r1_bio->behind_remaining, 0);
868 1053
869 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 1054 first_clone = 1;
870 test_bit(R1BIO_BehindIO, &r1_bio->state));
871 for (i = 0; i < disks; i++) { 1055 for (i = 0; i < disks; i++) {
872 struct bio *mbio; 1056 struct bio *mbio;
873 if (!r1_bio->bios[i]) 1057 if (!r1_bio->bios[i])
874 continue; 1058 continue;
875 1059
876 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1060 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
877 r1_bio->bios[i] = mbio; 1061 md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
878 1062
879 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 1063 if (first_clone) {
880 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1064 /* do behind I/O ?
881 mbio->bi_end_io = raid1_end_write_request; 1065 * Not if there are too many, or cannot
882 mbio->bi_rw = WRITE | do_flush_fua | do_sync; 1066 * allocate memory, or a reader on WriteMostly
883 mbio->bi_private = r1_bio; 1067 * is waiting for behind writes to flush */
884 1068 if (bitmap &&
885 if (r1_bio->behind_pages) { 1069 (atomic_read(&bitmap->behind_writes)
1070 < mddev->bitmap_info.max_write_behind) &&
1071 !waitqueue_active(&bitmap->behind_wait))
1072 alloc_behind_pages(mbio, r1_bio);
1073
1074 bitmap_startwrite(bitmap, r1_bio->sector,
1075 r1_bio->sectors,
1076 test_bit(R1BIO_BehindIO,
1077 &r1_bio->state));
1078 first_clone = 0;
1079 }
1080 if (r1_bio->behind_bvecs) {
886 struct bio_vec *bvec; 1081 struct bio_vec *bvec;
887 int j; 1082 int j;
888 1083
@@ -894,16 +1089,42 @@ static int make_request(mddev_t *mddev, struct bio * bio)
894 * them all 1089 * them all
895 */ 1090 */
896 __bio_for_each_segment(bvec, mbio, j, 0) 1091 __bio_for_each_segment(bvec, mbio, j, 0)
897 bvec->bv_page = r1_bio->behind_pages[j]; 1092 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
898 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 1093 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
899 atomic_inc(&r1_bio->behind_remaining); 1094 atomic_inc(&r1_bio->behind_remaining);
900 } 1095 }
901 1096
1097 r1_bio->bios[i] = mbio;
1098
1099 mbio->bi_sector = (r1_bio->sector +
1100 conf->mirrors[i].rdev->data_offset);
1101 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1102 mbio->bi_end_io = raid1_end_write_request;
1103 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
1104 mbio->bi_private = r1_bio;
1105
902 atomic_inc(&r1_bio->remaining); 1106 atomic_inc(&r1_bio->remaining);
903 spin_lock_irqsave(&conf->device_lock, flags); 1107 spin_lock_irqsave(&conf->device_lock, flags);
904 bio_list_add(&conf->pending_bio_list, mbio); 1108 bio_list_add(&conf->pending_bio_list, mbio);
905 spin_unlock_irqrestore(&conf->device_lock, flags); 1109 spin_unlock_irqrestore(&conf->device_lock, flags);
906 } 1110 }
1111 /* Mustn't call r1_bio_write_done before this next test,
1112 * as it could result in the bio being freed.
1113 */
1114 if (sectors_handled < (bio->bi_size >> 9)) {
1115 r1_bio_write_done(r1_bio);
1116 /* We need another r1_bio. It has already been counted
1117 * in bio->bi_phys_segments
1118 */
1119 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1120 r1_bio->master_bio = bio;
1121 r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1122 r1_bio->state = 0;
1123 r1_bio->mddev = mddev;
1124 r1_bio->sector = bio->bi_sector + sectors_handled;
1125 goto retry_write;
1126 }
1127
907 r1_bio_write_done(r1_bio); 1128 r1_bio_write_done(r1_bio);
908 1129
909 /* In case raid1d snuck in to freeze_array */ 1130 /* In case raid1d snuck in to freeze_array */
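
For illustration only (not part of the patch): the write path above may now split one incoming bio into several r1_bios, using bio->bi_phys_segments as a count of outstanding pieces (0 means "single piece", otherwise it holds the number of not-yet-completed parts). A minimal sketch of that bookkeeping, with locking omitted and names invented for the example:

/* sketch: split 'total' sectors into chunks of at most 'max_per_pass',
 * counting the pieces the way make_request() counts them.
 */
static int split_and_count(int total, int max_per_pass)
{
	int handled = 0;
	int segments = 0;	/* stays 0 if one pass covers everything */

	while (handled < total) {
		int chunk = total - handled;

		if (chunk > max_per_pass)
			chunk = max_per_pass;
		if (handled + chunk < total)	/* another piece will follow */
			segments = segments ? segments + 1 : 2;
		handled += chunk;
	}
	return segments;
}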
@@ -952,9 +1173,10 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
952 * However don't try a recovery from this drive as 1173 * However don't try a recovery from this drive as
953 * it is very likely to fail. 1174 * it is very likely to fail.
954 */ 1175 */
955 mddev->recovery_disabled = 1; 1176 conf->recovery_disabled = mddev->recovery_disabled;
956 return; 1177 return;
957 } 1178 }
1179 set_bit(Blocked, &rdev->flags);
958 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1180 if (test_and_clear_bit(In_sync, &rdev->flags)) {
959 unsigned long flags; 1181 unsigned long flags;
960 spin_lock_irqsave(&conf->device_lock, flags); 1182 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1027,7 +1249,7 @@ static int raid1_spare_active(mddev_t *mddev)
1027 && !test_bit(Faulty, &rdev->flags) 1249 && !test_bit(Faulty, &rdev->flags)
1028 && !test_and_set_bit(In_sync, &rdev->flags)) { 1250 && !test_and_set_bit(In_sync, &rdev->flags)) {
1029 count++; 1251 count++;
1030 sysfs_notify_dirent(rdev->sysfs_state); 1252 sysfs_notify_dirent_safe(rdev->sysfs_state);
1031 } 1253 }
1032 } 1254 }
1033 spin_lock_irqsave(&conf->device_lock, flags); 1255 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1048,6 +1270,9 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1048 int first = 0; 1270 int first = 0;
1049 int last = mddev->raid_disks - 1; 1271 int last = mddev->raid_disks - 1;
1050 1272
1273 if (mddev->recovery_disabled == conf->recovery_disabled)
1274 return -EBUSY;
1275
1051 if (rdev->raid_disk >= 0) 1276 if (rdev->raid_disk >= 0)
1052 first = last = rdev->raid_disk; 1277 first = last = rdev->raid_disk;
1053 1278
@@ -1103,7 +1328,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1103 * is not possible. 1328 * is not possible.
1104 */ 1329 */
1105 if (!test_bit(Faulty, &rdev->flags) && 1330 if (!test_bit(Faulty, &rdev->flags) &&
1106 !mddev->recovery_disabled && 1331 mddev->recovery_disabled != conf->recovery_disabled &&
1107 mddev->degraded < conf->raid_disks) { 1332 mddev->degraded < conf->raid_disks) {
1108 err = -EBUSY; 1333 err = -EBUSY;
1109 goto abort; 1334 goto abort;
@@ -1155,6 +1380,8 @@ static void end_sync_write(struct bio *bio, int error)
1155 conf_t *conf = mddev->private; 1380 conf_t *conf = mddev->private;
1156 int i; 1381 int i;
1157 int mirror=0; 1382 int mirror=0;
1383 sector_t first_bad;
1384 int bad_sectors;
1158 1385
1159 for (i = 0; i < conf->raid_disks; i++) 1386 for (i = 0; i < conf->raid_disks; i++)
1160 if (r1_bio->bios[i] == bio) { 1387 if (r1_bio->bios[i] == bio) {
@@ -1172,18 +1399,48 @@ static void end_sync_write(struct bio *bio, int error)
1172 s += sync_blocks; 1399 s += sync_blocks;
1173 sectors_to_go -= sync_blocks; 1400 sectors_to_go -= sync_blocks;
1174 } while (sectors_to_go > 0); 1401 } while (sectors_to_go > 0);
1175 md_error(mddev, conf->mirrors[mirror].rdev); 1402 set_bit(WriteErrorSeen,
1176 } 1403 &conf->mirrors[mirror].rdev->flags);
1404 set_bit(R1BIO_WriteError, &r1_bio->state);
1405 } else if (is_badblock(conf->mirrors[mirror].rdev,
1406 r1_bio->sector,
1407 r1_bio->sectors,
1408 &first_bad, &bad_sectors) &&
1409 !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1410 r1_bio->sector,
1411 r1_bio->sectors,
1412 &first_bad, &bad_sectors)
1413 )
1414 set_bit(R1BIO_MadeGood, &r1_bio->state);
1177 1415
1178 update_head_pos(mirror, r1_bio); 1416 update_head_pos(mirror, r1_bio);
1179 1417
1180 if (atomic_dec_and_test(&r1_bio->remaining)) { 1418 if (atomic_dec_and_test(&r1_bio->remaining)) {
1181 sector_t s = r1_bio->sectors; 1419 int s = r1_bio->sectors;
1182 put_buf(r1_bio); 1420 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1183 md_done_sync(mddev, s, uptodate); 1421 test_bit(R1BIO_WriteError, &r1_bio->state))
1422 reschedule_retry(r1_bio);
1423 else {
1424 put_buf(r1_bio);
1425 md_done_sync(mddev, s, uptodate);
1426 }
1184 } 1427 }
1185} 1428}
1186 1429
1430static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1431 int sectors, struct page *page, int rw)
1432{
1433 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1434 /* success */
1435 return 1;
1436 if (rw == WRITE)
1437 set_bit(WriteErrorSeen, &rdev->flags);
1438 /* need to record an error - either for the block or the device */
1439 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1440 md_error(rdev->mddev, rdev);
1441 return 0;
1442}
1443
1187static int fix_sync_read_error(r1bio_t *r1_bio) 1444static int fix_sync_read_error(r1bio_t *r1_bio)
1188{ 1445{
1189 /* Try some synchronous reads of other devices to get 1446 /* Try some synchronous reads of other devices to get
@@ -1193,6 +1450,9 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1193 * We don't need to freeze the array, because being in an 1450 * We don't need to freeze the array, because being in an
1194 * active sync request, there is no normal IO, and 1451 * active sync request, there is no normal IO, and
1195 * no overlapping syncs. 1452 * no overlapping syncs.
1453 * We don't need to check is_badblock() again as we
1454 * made sure that anything with a bad block in range
1455 * will have bi_end_io clear.
1196 */ 1456 */
1197 mddev_t *mddev = r1_bio->mddev; 1457 mddev_t *mddev = r1_bio->mddev;
1198 conf_t *conf = mddev->private; 1458 conf_t *conf = mddev->private;
@@ -1217,9 +1477,7 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1217 * active, and resync is currently active 1477 * active, and resync is currently active
1218 */ 1478 */
1219 rdev = conf->mirrors[d].rdev; 1479 rdev = conf->mirrors[d].rdev;
1220 if (sync_page_io(rdev, 1480 if (sync_page_io(rdev, sect, s<<9,
1221 sect,
1222 s<<9,
1223 bio->bi_io_vec[idx].bv_page, 1481 bio->bi_io_vec[idx].bv_page,
1224 READ, false)) { 1482 READ, false)) {
1225 success = 1; 1483 success = 1;
@@ -1233,16 +1491,36 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1233 1491
1234 if (!success) { 1492 if (!success) {
1235 char b[BDEVNAME_SIZE]; 1493 char b[BDEVNAME_SIZE];
1236 /* Cannot read from anywhere, array is toast */ 1494 int abort = 0;
1237 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1495 /* Cannot read from anywhere, this block is lost.
1496 * Record a bad block on each device. If that doesn't
1497 * work just disable and interrupt the recovery.
1498 * Don't fail devices as that won't really help.
1499 */
1238 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1500 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239 " for block %llu\n", 1501 " for block %llu\n",
1240 mdname(mddev), 1502 mdname(mddev),
1241 bdevname(bio->bi_bdev, b), 1503 bdevname(bio->bi_bdev, b),
1242 (unsigned long long)r1_bio->sector); 1504 (unsigned long long)r1_bio->sector);
1243 md_done_sync(mddev, r1_bio->sectors, 0); 1505 for (d = 0; d < conf->raid_disks; d++) {
1244 put_buf(r1_bio); 1506 rdev = conf->mirrors[d].rdev;
1245 return 0; 1507 if (!rdev || test_bit(Faulty, &rdev->flags))
1508 continue;
1509 if (!rdev_set_badblocks(rdev, sect, s, 0))
1510 abort = 1;
1511 }
1512 if (abort) {
1513 mddev->recovery_disabled = 1;
1514 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1515 md_done_sync(mddev, r1_bio->sectors, 0);
1516 put_buf(r1_bio);
1517 return 0;
1518 }
1519 /* Try next page */
1520 sectors -= s;
1521 sect += s;
1522 idx++;
1523 continue;
1246 } 1524 }
1247 1525
1248 start = d; 1526 start = d;
@@ -1254,16 +1532,12 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1254 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1532 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255 continue; 1533 continue;
1256 rdev = conf->mirrors[d].rdev; 1534 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev, 1535 if (r1_sync_page_io(rdev, sect, s,
1258 sect, 1536 bio->bi_io_vec[idx].bv_page,
1259 s<<9, 1537 WRITE) == 0) {
1260 bio->bi_io_vec[idx].bv_page,
1261 WRITE, false) == 0) {
1262 r1_bio->bios[d]->bi_end_io = NULL; 1538 r1_bio->bios[d]->bi_end_io = NULL;
1263 rdev_dec_pending(rdev, mddev); 1539 rdev_dec_pending(rdev, mddev);
1264 md_error(mddev, rdev); 1540 }
1265 } else
1266 atomic_add(s, &rdev->corrected_errors);
1267 } 1541 }
1268 d = start; 1542 d = start;
1269 while (d != r1_bio->read_disk) { 1543 while (d != r1_bio->read_disk) {
@@ -1273,12 +1547,10 @@ static int fix_sync_read_error(r1bio_t *r1_bio)
1273 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1547 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274 continue; 1548 continue;
1275 rdev = conf->mirrors[d].rdev; 1549 rdev = conf->mirrors[d].rdev;
1276 if (sync_page_io(rdev, 1550 if (r1_sync_page_io(rdev, sect, s,
1277 sect, 1551 bio->bi_io_vec[idx].bv_page,
1278 s<<9, 1552 READ) != 0)
1279 bio->bi_io_vec[idx].bv_page, 1553 atomic_add(s, &rdev->corrected_errors);
1280 READ, false) == 0)
1281 md_error(mddev, rdev);
1282 } 1554 }
1283 sectors -= s; 1555 sectors -= s;
1284 sect += s; 1556 sect += s;
@@ -1420,7 +1692,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1420 * 1692 *
1421 * 1. Retries failed read operations on working mirrors. 1693 * 1. Retries failed read operations on working mirrors.
1422 * 2. Updates the raid superblock when problems encounter. 1694 * 2. Updates the raid superblock when problems encounter.
1423 * 3. Performs writes following reads for array syncronising. 1695 * 3. Performs writes following reads for array synchronising.
1424 */ 1696 */
1425 1697
1426static void fix_read_error(conf_t *conf, int read_disk, 1698static void fix_read_error(conf_t *conf, int read_disk,
@@ -1443,9 +1715,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
1443 * which is the thread that might remove 1715 * which is the thread that might remove
1444 * a device. If raid1d ever becomes multi-threaded.... 1716 * a device. If raid1d ever becomes multi-threaded....
1445 */ 1717 */
1718 sector_t first_bad;
1719 int bad_sectors;
1720
1446 rdev = conf->mirrors[d].rdev; 1721 rdev = conf->mirrors[d].rdev;
1447 if (rdev && 1722 if (rdev &&
1448 test_bit(In_sync, &rdev->flags) && 1723 test_bit(In_sync, &rdev->flags) &&
1724 is_badblock(rdev, sect, s,
1725 &first_bad, &bad_sectors) == 0 &&
1449 sync_page_io(rdev, sect, s<<9, 1726 sync_page_io(rdev, sect, s<<9,
1450 conf->tmppage, READ, false)) 1727 conf->tmppage, READ, false))
1451 success = 1; 1728 success = 1;
@@ -1457,8 +1734,10 @@ static void fix_read_error(conf_t *conf, int read_disk,
1457 } while (!success && d != read_disk); 1734 } while (!success && d != read_disk);
1458 1735
1459 if (!success) { 1736 if (!success) {
1460 /* Cannot read from anywhere -- bye bye array */ 1737 /* Cannot read from anywhere - mark it bad */
1461 md_error(mddev, conf->mirrors[read_disk].rdev); 1738 mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
1739 if (!rdev_set_badblocks(rdev, sect, s, 0))
1740 md_error(mddev, rdev);
1462 break; 1741 break;
1463 } 1742 }
1464 /* write it back and re-read */ 1743 /* write it back and re-read */
@@ -1469,13 +1748,9 @@ static void fix_read_error(conf_t *conf, int read_disk,
1469 d--; 1748 d--;
1470 rdev = conf->mirrors[d].rdev; 1749 rdev = conf->mirrors[d].rdev;
1471 if (rdev && 1750 if (rdev &&
1472 test_bit(In_sync, &rdev->flags)) { 1751 test_bit(In_sync, &rdev->flags))
1473 if (sync_page_io(rdev, sect, s<<9, 1752 r1_sync_page_io(rdev, sect, s,
1474 conf->tmppage, WRITE, false) 1753 conf->tmppage, WRITE);
1475 == 0)
1476 /* Well, this device is dead */
1477 md_error(mddev, rdev);
1478 }
1479 } 1754 }
1480 d = start; 1755 d = start;
1481 while (d != read_disk) { 1756 while (d != read_disk) {
@@ -1486,12 +1761,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1486 rdev = conf->mirrors[d].rdev; 1761 rdev = conf->mirrors[d].rdev;
1487 if (rdev && 1762 if (rdev &&
1488 test_bit(In_sync, &rdev->flags)) { 1763 test_bit(In_sync, &rdev->flags)) {
1489 if (sync_page_io(rdev, sect, s<<9, 1764 if (r1_sync_page_io(rdev, sect, s,
1490 conf->tmppage, READ, false) 1765 conf->tmppage, READ)) {
1491 == 0)
1492 /* Well, this device is dead */
1493 md_error(mddev, rdev);
1494 else {
1495 atomic_add(s, &rdev->corrected_errors); 1766 atomic_add(s, &rdev->corrected_errors);
1496 printk(KERN_INFO 1767 printk(KERN_INFO
1497 "md/raid1:%s: read error corrected " 1768 "md/raid1:%s: read error corrected "
@@ -1508,21 +1779,255 @@ static void fix_read_error(conf_t *conf, int read_disk,
1508 } 1779 }
1509} 1780}
1510 1781
1782static void bi_complete(struct bio *bio, int error)
1783{
1784 complete((struct completion *)bio->bi_private);
1785}
1786
1787static int submit_bio_wait(int rw, struct bio *bio)
1788{
1789 struct completion event;
1790 rw |= REQ_SYNC;
1791
1792 init_completion(&event);
1793 bio->bi_private = &event;
1794 bio->bi_end_io = bi_complete;
1795 submit_bio(rw, bio);
1796 wait_for_completion(&event);
1797
1798 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1799}
1800
1801static int narrow_write_error(r1bio_t *r1_bio, int i)
1802{
1803 mddev_t *mddev = r1_bio->mddev;
1804 conf_t *conf = mddev->private;
1805 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1806 int vcnt, idx;
1807 struct bio_vec *vec;
1808
1809 /* bio has the data to be written to device 'i' where
1810 * we just recently had a write error.
1811 * We repeatedly clone the bio and trim down to one block,
1812 * then try the write. Where the write fails we record
1813 * a bad block.
1814 * It is conceivable that the bio doesn't exactly align with
1815 * blocks. We must handle this somehow.
1816 *
1817 * We currently own a reference on the rdev.
1818 */
1819
1820 int block_sectors;
1821 sector_t sector;
1822 int sectors;
1823 int sect_to_write = r1_bio->sectors;
1824 int ok = 1;
1825
1826 if (rdev->badblocks.shift < 0)
1827 return 0;
1828
1829 block_sectors = 1 << rdev->badblocks.shift;
1830 sector = r1_bio->sector;
1831 sectors = ((sector + block_sectors)
1832 & ~(sector_t)(block_sectors - 1))
1833 - sector;
1834
1835 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
1836 vcnt = r1_bio->behind_page_count;
1837 vec = r1_bio->behind_bvecs;
1838 idx = 0;
1839 while (vec[idx].bv_page == NULL)
1840 idx++;
1841 } else {
1842 vcnt = r1_bio->master_bio->bi_vcnt;
1843 vec = r1_bio->master_bio->bi_io_vec;
1844 idx = r1_bio->master_bio->bi_idx;
1845 }
1846 while (sect_to_write) {
1847 struct bio *wbio;
1848 if (sectors > sect_to_write)
1849 sectors = sect_to_write;
1850 /* Write at 'sector' for 'sectors'*/
1851
1852 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
1853 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
1854 wbio->bi_sector = r1_bio->sector;
1855 wbio->bi_rw = WRITE;
1856 wbio->bi_vcnt = vcnt;
1857 wbio->bi_size = r1_bio->sectors << 9;
1858 wbio->bi_idx = idx;
1859
1860 md_trim_bio(wbio, sector - r1_bio->sector, sectors);
1861 wbio->bi_sector += rdev->data_offset;
1862 wbio->bi_bdev = rdev->bdev;
1863 if (submit_bio_wait(WRITE, wbio) == 0)
1864 /* failure! */
1865 ok = rdev_set_badblocks(rdev, sector,
1866 sectors, 0)
1867 && ok;
1868
1869 bio_put(wbio);
1870 sect_to_write -= sectors;
1871 sector += sectors;
1872 sectors = block_sectors;
1873 }
1874 return ok;
1875}
1876
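
For illustration only (not part of the patch): narrow_write_error() above retries the failed write one badblock-sized unit at a time, trimming the first chunk so that every later chunk is aligned to 1 << badblocks.shift sectors. A small standalone sketch of that chunk walk:

/* sketch: walk a failed write range in badblock-sized, aligned chunks */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int shift = 3;				/* 8-sector bad blocks */
	int block_sectors = 1 << shift;
	sector_t sector = 21;			/* start of the failed write */
	int sect_to_write = 30;			/* its length in sectors */
	int sectors = (int)(((sector + block_sectors) &
			     ~(sector_t)(block_sectors - 1)) - sector);

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		printf("retry write: sector %llu, %d sectors\n", sector, sectors);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;	/* remaining chunks are aligned */
	}
	return 0;
}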
1877static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
1878{
1879 int m;
1880 int s = r1_bio->sectors;
1881 for (m = 0; m < conf->raid_disks ; m++) {
1882 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1883 struct bio *bio = r1_bio->bios[m];
1884 if (bio->bi_end_io == NULL)
1885 continue;
1886 if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1887 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
1888 rdev_clear_badblocks(rdev, r1_bio->sector, s);
1889 }
1890 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
1891 test_bit(R1BIO_WriteError, &r1_bio->state)) {
1892 if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
1893 md_error(conf->mddev, rdev);
1894 }
1895 }
1896 put_buf(r1_bio);
1897 md_done_sync(conf->mddev, s, 1);
1898}
1899
1900static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
1901{
1902 int m;
1903 for (m = 0; m < conf->raid_disks ; m++)
1904 if (r1_bio->bios[m] == IO_MADE_GOOD) {
1905 mdk_rdev_t *rdev = conf->mirrors[m].rdev;
1906 rdev_clear_badblocks(rdev,
1907 r1_bio->sector,
1908 r1_bio->sectors);
1909 rdev_dec_pending(rdev, conf->mddev);
1910 } else if (r1_bio->bios[m] != NULL) {
1911 /* This drive got a write error. We need to
1912 * narrow down and record precise write
1913 * errors.
1914 */
1915 if (!narrow_write_error(r1_bio, m)) {
1916 md_error(conf->mddev,
1917 conf->mirrors[m].rdev);
1918 /* an I/O failed, we can't clear the bitmap */
1919 set_bit(R1BIO_Degraded, &r1_bio->state);
1920 }
1921 rdev_dec_pending(conf->mirrors[m].rdev,
1922 conf->mddev);
1923 }
1924 if (test_bit(R1BIO_WriteError, &r1_bio->state))
1925 close_write(r1_bio);
1926 raid_end_bio_io(r1_bio);
1927}
1928
1929static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
1930{
1931 int disk;
1932 int max_sectors;
1933 mddev_t *mddev = conf->mddev;
1934 struct bio *bio;
1935 char b[BDEVNAME_SIZE];
1936 mdk_rdev_t *rdev;
1937
1938 clear_bit(R1BIO_ReadError, &r1_bio->state);
1939 /* we got a read error. Maybe the drive is bad. Maybe just
1940 * the block and we can fix it.
1941 * We freeze all other IO, and try reading the block from
1942 * other devices. When we find one, we re-write
 1943 * and check if that fixes the read error.
1944 * This is all done synchronously while the array is
1945 * frozen
1946 */
1947 if (mddev->ro == 0) {
1948 freeze_array(conf);
1949 fix_read_error(conf, r1_bio->read_disk,
1950 r1_bio->sector, r1_bio->sectors);
1951 unfreeze_array(conf);
1952 } else
1953 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1954
1955 bio = r1_bio->bios[r1_bio->read_disk];
1956 bdevname(bio->bi_bdev, b);
1957read_more:
1958 disk = read_balance(conf, r1_bio, &max_sectors);
1959 if (disk == -1) {
1960 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1961 " read error for block %llu\n",
1962 mdname(mddev), b, (unsigned long long)r1_bio->sector);
1963 raid_end_bio_io(r1_bio);
1964 } else {
1965 const unsigned long do_sync
1966 = r1_bio->master_bio->bi_rw & REQ_SYNC;
1967 if (bio) {
1968 r1_bio->bios[r1_bio->read_disk] =
1969 mddev->ro ? IO_BLOCKED : NULL;
1970 bio_put(bio);
1971 }
1972 r1_bio->read_disk = disk;
1973 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
1974 md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors);
1975 r1_bio->bios[r1_bio->read_disk] = bio;
1976 rdev = conf->mirrors[disk].rdev;
1977 printk_ratelimited(KERN_ERR
1978 "md/raid1:%s: redirecting sector %llu"
1979 " to other mirror: %s\n",
1980 mdname(mddev),
1981 (unsigned long long)r1_bio->sector,
1982 bdevname(rdev->bdev, b));
1983 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1984 bio->bi_bdev = rdev->bdev;
1985 bio->bi_end_io = raid1_end_read_request;
1986 bio->bi_rw = READ | do_sync;
1987 bio->bi_private = r1_bio;
1988 if (max_sectors < r1_bio->sectors) {
1989 /* Drat - have to split this up more */
1990 struct bio *mbio = r1_bio->master_bio;
1991 int sectors_handled = (r1_bio->sector + max_sectors
1992 - mbio->bi_sector);
1993 r1_bio->sectors = max_sectors;
1994 spin_lock_irq(&conf->device_lock);
1995 if (mbio->bi_phys_segments == 0)
1996 mbio->bi_phys_segments = 2;
1997 else
1998 mbio->bi_phys_segments++;
1999 spin_unlock_irq(&conf->device_lock);
2000 generic_make_request(bio);
2001 bio = NULL;
2002
2003 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
2004
2005 r1_bio->master_bio = mbio;
2006 r1_bio->sectors = (mbio->bi_size >> 9)
2007 - sectors_handled;
2008 r1_bio->state = 0;
2009 set_bit(R1BIO_ReadError, &r1_bio->state);
2010 r1_bio->mddev = mddev;
2011 r1_bio->sector = mbio->bi_sector + sectors_handled;
2012
2013 goto read_more;
2014 } else
2015 generic_make_request(bio);
2016 }
2017}
2018
1511static void raid1d(mddev_t *mddev) 2019static void raid1d(mddev_t *mddev)
1512{ 2020{
1513 r1bio_t *r1_bio; 2021 r1bio_t *r1_bio;
1514 struct bio *bio;
1515 unsigned long flags; 2022 unsigned long flags;
1516 conf_t *conf = mddev->private; 2023 conf_t *conf = mddev->private;
1517 struct list_head *head = &conf->retry_list; 2024 struct list_head *head = &conf->retry_list;
1518 mdk_rdev_t *rdev;
1519 struct blk_plug plug; 2025 struct blk_plug plug;
1520 2026
1521 md_check_recovery(mddev); 2027 md_check_recovery(mddev);
1522 2028
1523 blk_start_plug(&plug); 2029 blk_start_plug(&plug);
1524 for (;;) { 2030 for (;;) {
1525 char b[BDEVNAME_SIZE];
1526 2031
1527 if (atomic_read(&mddev->plug_cnt) == 0) 2032 if (atomic_read(&mddev->plug_cnt) == 0)
1528 flush_pending_writes(conf); 2033 flush_pending_writes(conf);
@@ -1539,62 +2044,26 @@ static void raid1d(mddev_t *mddev)
1539 2044
1540 mddev = r1_bio->mddev; 2045 mddev = r1_bio->mddev;
1541 conf = mddev->private; 2046 conf = mddev->private;
1542 if (test_bit(R1BIO_IsSync, &r1_bio->state)) 2047 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1543 sync_request_write(mddev, r1_bio); 2048 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1544 else { 2049 test_bit(R1BIO_WriteError, &r1_bio->state))
1545 int disk; 2050 handle_sync_write_finished(conf, r1_bio);
1546 2051 else
1547 /* we got a read error. Maybe the drive is bad. Maybe just 2052 sync_request_write(mddev, r1_bio);
1548 * the block and we can fix it. 2053 } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1549 * We freeze all other IO, and try reading the block from 2054 test_bit(R1BIO_WriteError, &r1_bio->state))
1550 * other devices. When we find one, we re-write 2055 handle_write_finished(conf, r1_bio);
1551 * and check it that fixes the read error. 2056 else if (test_bit(R1BIO_ReadError, &r1_bio->state))
1552 * This is all done synchronously while the array is 2057 handle_read_error(conf, r1_bio);
1553 * frozen 2058 else
2059 /* just a partial read to be scheduled from separate
2060 * context
1554 */ 2061 */
1555 if (mddev->ro == 0) { 2062 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
1556 freeze_array(conf); 2063
1557 fix_read_error(conf, r1_bio->read_disk,
1558 r1_bio->sector,
1559 r1_bio->sectors);
1560 unfreeze_array(conf);
1561 } else
1562 md_error(mddev,
1563 conf->mirrors[r1_bio->read_disk].rdev);
1564
1565 bio = r1_bio->bios[r1_bio->read_disk];
1566 if ((disk=read_balance(conf, r1_bio)) == -1) {
1567 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
1568 " read error for block %llu\n",
1569 mdname(mddev),
1570 bdevname(bio->bi_bdev,b),
1571 (unsigned long long)r1_bio->sector);
1572 raid_end_bio_io(r1_bio);
1573 } else {
1574 const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
1575 r1_bio->bios[r1_bio->read_disk] =
1576 mddev->ro ? IO_BLOCKED : NULL;
1577 r1_bio->read_disk = disk;
1578 bio_put(bio);
1579 bio = bio_clone_mddev(r1_bio->master_bio,
1580 GFP_NOIO, mddev);
1581 r1_bio->bios[r1_bio->read_disk] = bio;
1582 rdev = conf->mirrors[disk].rdev;
1583 if (printk_ratelimit())
1584 printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
1585 " other mirror: %s\n",
1586 mdname(mddev),
1587 (unsigned long long)r1_bio->sector,
1588 bdevname(rdev->bdev,b));
1589 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1590 bio->bi_bdev = rdev->bdev;
1591 bio->bi_end_io = raid1_end_read_request;
1592 bio->bi_rw = READ | do_sync;
1593 bio->bi_private = r1_bio;
1594 generic_make_request(bio);
1595 }
1596 }
1597 cond_resched(); 2064 cond_resched();
2065 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2066 md_check_recovery(mddev);
1598 } 2067 }
1599 blk_finish_plug(&plug); 2068 blk_finish_plug(&plug);
1600} 2069}
@@ -1636,6 +2105,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1636 int write_targets = 0, read_targets = 0; 2105 int write_targets = 0, read_targets = 0;
1637 sector_t sync_blocks; 2106 sector_t sync_blocks;
1638 int still_degraded = 0; 2107 int still_degraded = 0;
2108 int good_sectors = RESYNC_SECTORS;
2109 int min_bad = 0; /* number of sectors that are bad in all devices */
1639 2110
1640 if (!conf->r1buf_pool) 2111 if (!conf->r1buf_pool)
1641 if (init_resync(conf)) 2112 if (init_resync(conf))
@@ -1723,36 +2194,89 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1723 2194
1724 rdev = rcu_dereference(conf->mirrors[i].rdev); 2195 rdev = rcu_dereference(conf->mirrors[i].rdev);
1725 if (rdev == NULL || 2196 if (rdev == NULL ||
1726 test_bit(Faulty, &rdev->flags)) { 2197 test_bit(Faulty, &rdev->flags)) {
1727 still_degraded = 1; 2198 still_degraded = 1;
1728 continue;
1729 } else if (!test_bit(In_sync, &rdev->flags)) { 2199 } else if (!test_bit(In_sync, &rdev->flags)) {
1730 bio->bi_rw = WRITE; 2200 bio->bi_rw = WRITE;
1731 bio->bi_end_io = end_sync_write; 2201 bio->bi_end_io = end_sync_write;
1732 write_targets ++; 2202 write_targets ++;
1733 } else { 2203 } else {
1734 /* may need to read from here */ 2204 /* may need to read from here */
1735 bio->bi_rw = READ; 2205 sector_t first_bad = MaxSector;
1736 bio->bi_end_io = end_sync_read; 2206 int bad_sectors;
1737 if (test_bit(WriteMostly, &rdev->flags)) { 2207
1738 if (wonly < 0) 2208 if (is_badblock(rdev, sector_nr, good_sectors,
1739 wonly = i; 2209 &first_bad, &bad_sectors)) {
1740 } else { 2210 if (first_bad > sector_nr)
1741 if (disk < 0) 2211 good_sectors = first_bad - sector_nr;
1742 disk = i; 2212 else {
2213 bad_sectors -= (sector_nr - first_bad);
2214 if (min_bad == 0 ||
2215 min_bad > bad_sectors)
2216 min_bad = bad_sectors;
2217 }
2218 }
2219 if (sector_nr < first_bad) {
2220 if (test_bit(WriteMostly, &rdev->flags)) {
2221 if (wonly < 0)
2222 wonly = i;
2223 } else {
2224 if (disk < 0)
2225 disk = i;
2226 }
2227 bio->bi_rw = READ;
2228 bio->bi_end_io = end_sync_read;
2229 read_targets++;
1743 } 2230 }
1744 read_targets++;
1745 } 2231 }
1746 atomic_inc(&rdev->nr_pending); 2232 if (bio->bi_end_io) {
1747 bio->bi_sector = sector_nr + rdev->data_offset; 2233 atomic_inc(&rdev->nr_pending);
1748 bio->bi_bdev = rdev->bdev; 2234 bio->bi_sector = sector_nr + rdev->data_offset;
1749 bio->bi_private = r1_bio; 2235 bio->bi_bdev = rdev->bdev;
2236 bio->bi_private = r1_bio;
2237 }
1750 } 2238 }
1751 rcu_read_unlock(); 2239 rcu_read_unlock();
1752 if (disk < 0) 2240 if (disk < 0)
1753 disk = wonly; 2241 disk = wonly;
1754 r1_bio->read_disk = disk; 2242 r1_bio->read_disk = disk;
1755 2243
2244 if (read_targets == 0 && min_bad > 0) {
2245 /* These sectors are bad on all InSync devices, so we
2246 * need to mark them bad on all write targets
2247 */
2248 int ok = 1;
2249 for (i = 0 ; i < conf->raid_disks ; i++)
2250 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2251 mdk_rdev_t *rdev =
2252 rcu_dereference(conf->mirrors[i].rdev);
2253 ok = rdev_set_badblocks(rdev, sector_nr,
2254 min_bad, 0
2255 ) && ok;
2256 }
2257 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2258 *skipped = 1;
2259 put_buf(r1_bio);
2260
2261 if (!ok) {
2262 /* Cannot record the badblocks, so need to
2263 * abort the resync.
2264 * If there are multiple read targets, could just
2265 * fail the really bad ones ???
2266 */
2267 conf->recovery_disabled = mddev->recovery_disabled;
2268 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2269 return 0;
2270 } else
2271 return min_bad;
2272
2273 }
2274 if (min_bad > 0 && min_bad < good_sectors) {
2275 /* only resync enough to reach the next bad->good
2276 * transition */
2277 good_sectors = min_bad;
2278 }
2279
1756 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) 2280 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1757 /* extra read targets are also write targets */ 2281 /* extra read targets are also write targets */
1758 write_targets += read_targets-1; 2282 write_targets += read_targets-1;
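The good_sectors/min_bad bookkeeping above is plain interval arithmetic: a device's known-bad range either starts beyond sector_nr, in which case it only limits how far this pass may read, or it already covers sector_nr, in which case its remaining length feeds min_bad (and, when no read target survives, those sectors are marked bad on every write target instead of being synced). A minimal userspace sketch of that clipping, with invented helper and variable names rather than kernel API:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Clip a resync window [start, start+len) against one device's bad
 * range [first_bad, first_bad+bad_len), mirroring the good_sectors and
 * min_bad updates in sync_request() above. */
static void clip_window(sector_t start, sector_t len,
                        sector_t first_bad, sector_t bad_len,
                        sector_t *good_sectors, sector_t *min_bad)
{
	if (first_bad >= start + len)
		return;				/* bad range is past this window */
	if (first_bad > start) {
		/* clean up to the first bad sector */
		if (first_bad - start < *good_sectors)
			*good_sectors = first_bad - start;
	} else {
		/* window starts inside the bad range */
		sector_t bad_here = bad_len - (start - first_bad);
		if (*min_bad == 0 || bad_here < *min_bad)
			*min_bad = bad_here;
	}
}

int main(void)
{
	sector_t good = 128, min_bad = 0;	/* window of 128 sectors */

	clip_window(1000, 128, 1040, 16, &good, &min_bad);	/* bad range later on */
	clip_window(1000, 128,  992, 24, &good, &min_bad);	/* bad range covers the start */

	printf("good_sectors=%llu min_bad=%llu\n", good, min_bad);	/* 40 and 16 */
	return 0;
}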
@@ -1769,6 +2293,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1769 2293
1770 if (max_sector > mddev->resync_max) 2294 if (max_sector > mddev->resync_max)
1771 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 2295 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2296 if (max_sector > sector_nr + good_sectors)
2297 max_sector = sector_nr + good_sectors;
1772 nr_sectors = 0; 2298 nr_sectors = 0;
1773 sync_blocks = 0; 2299 sync_blocks = 0;
1774 do { 2300 do {
@@ -2045,8 +2571,7 @@ static int stop(mddev_t *mddev)
2045 raise_barrier(conf); 2571 raise_barrier(conf);
2046 lower_barrier(conf); 2572 lower_barrier(conf);
2047 2573
2048 md_unregister_thread(mddev->thread); 2574 md_unregister_thread(&mddev->thread);
2049 mddev->thread = NULL;
2050 if (conf->r1bio_pool) 2575 if (conf->r1bio_pool)
2051 mempool_destroy(conf->r1bio_pool); 2576 mempool_destroy(conf->r1bio_pool);
2052 kfree(conf->mirrors); 2577 kfree(conf->mirrors);
@@ -2154,18 +2679,13 @@ static int raid1_reshape(mddev_t *mddev)
2154 for (d = d2 = 0; d < conf->raid_disks; d++) { 2679 for (d = d2 = 0; d < conf->raid_disks; d++) {
2155 mdk_rdev_t *rdev = conf->mirrors[d].rdev; 2680 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2156 if (rdev && rdev->raid_disk != d2) { 2681 if (rdev && rdev->raid_disk != d2) {
2157 char nm[20]; 2682 sysfs_unlink_rdev(mddev, rdev);
2158 sprintf(nm, "rd%d", rdev->raid_disk);
2159 sysfs_remove_link(&mddev->kobj, nm);
2160 rdev->raid_disk = d2; 2683 rdev->raid_disk = d2;
2161 sprintf(nm, "rd%d", rdev->raid_disk); 2684 sysfs_unlink_rdev(mddev, rdev);
2162 sysfs_remove_link(&mddev->kobj, nm); 2685 if (sysfs_link_rdev(mddev, rdev))
2163 if (sysfs_create_link(&mddev->kobj,
2164 &rdev->kobj, nm))
2165 printk(KERN_WARNING 2686 printk(KERN_WARNING
2166 "md/raid1:%s: cannot register " 2687 "md/raid1:%s: cannot register rd%d\n",
2167 "%s\n", 2688 mdname(mddev), rdev->raid_disk);
2168 mdname(mddev), nm);
2169 } 2689 }
2170 if (rdev) 2690 if (rdev)
2171 newmirrors[d2++].rdev = rdev; 2691 newmirrors[d2++].rdev = rdev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e743a64fac4..e0d676b4897 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,12 @@ struct r1_private_data_s {
48 * (fresh device added). 48 * (fresh device added).
49 * Cleared when a sync completes. 49 * Cleared when a sync completes.
50 */ 50 */
51 int recovery_disabled; /* when the same as
52 * mddev->recovery_disabled
53 * we don't allow recovery
54 * to be attempted as we
55 * expect a read error
56 */
51 57
52 wait_queue_head_t wait_barrier; 58 wait_queue_head_t wait_barrier;
53 59
@@ -95,7 +101,7 @@ struct r1bio_s {
95 101
96 struct list_head retry_list; 102 struct list_head retry_list;
97 /* Next two are only valid when R1BIO_BehindIO is set */ 103 /* Next two are only valid when R1BIO_BehindIO is set */
98 struct page **behind_pages; 104 struct bio_vec *behind_bvecs;
99 int behind_page_count; 105 int behind_page_count;
100 /* 106 /*
101 * if the IO is in WRITE direction, then multiple bios are used. 107 * if the IO is in WRITE direction, then multiple bios are used.
@@ -110,13 +116,24 @@ struct r1bio_s {
110 * correct the read error. To keep track of bad blocks on a per-bio 116 * correct the read error. To keep track of bad blocks on a per-bio
111 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 117 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
112 */ 118 */
113#define IO_BLOCKED ((struct bio*)1) 119#define IO_BLOCKED ((struct bio *)1)
120/* When we successfully write to a known bad-block, we need to remove the
121 * bad-block marking which must be done from process context. So we record
122 * the success by setting bios[n] to IO_MADE_GOOD
123 */
124#define IO_MADE_GOOD ((struct bio *)2)
125
126#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
114 127
115/* bits for r1bio.state */ 128/* bits for r1bio.state */
116#define R1BIO_Uptodate 0 129#define R1BIO_Uptodate 0
117#define R1BIO_IsSync 1 130#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 131#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 132#define R1BIO_BehindIO 3
133/* Set ReadError on bios that experience a readerror so that
134 * raid1d knows what to do with them.
135 */
136#define R1BIO_ReadError 4
120/* For write-behind requests, we call bi_end_io when 137/* For write-behind requests, we call bi_end_io when
121 * the last non-write-behind device completes, providing 138 * the last non-write-behind device completes, providing
122 * any write was successful. Otherwise we call when 139 * any write was successful. Otherwise we call when
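IO_BLOCKED and IO_MADE_GOOD work because no real bio is ever allocated at address 1 or 2, so tiny integers cast to pointers can serve as per-slot markers, and BIO_SPECIAL() distinguishes markers (and NULL) from bios that still need bio_put(). A standalone illustration of the same sentinel-pointer trick, using stand-in definitions rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

/* Small constants cast to pointers double as per-slot flags; anything
 * whose value is <= 2 (including NULL) is a marker, not a real bio. */
#define IO_BLOCKED	((struct bio *)1)
#define IO_MADE_GOOD	((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

int main(void)
{
	struct bio *slots[3];

	slots[0] = malloc(sizeof(struct bio));	/* a real bio */
	slots[1] = IO_BLOCKED;			/* don't retry this device */
	slots[2] = IO_MADE_GOOD;		/* clear its bad blocks later */

	for (int i = 0; i < 3; i++)
		printf("slot %d: %s\n", i,
		       BIO_SPECIAL(slots[i]) ? "marker" : "real bio");

	free(slots[0]);
	return 0;
}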
@@ -125,6 +142,11 @@ struct r1bio_s {
125 * Record that bi_end_io was called with this flag... 142 * Record that bi_end_io was called with this flag...
126 */ 143 */
127#define R1BIO_Returned 6 144#define R1BIO_Returned 6
145/* If a write for this request means we can clear some
146 * known-bad-block records, we set this flag
147 */
148#define R1BIO_MadeGood 7
149#define R1BIO_WriteError 8
128 150
129extern int md_raid1_congested(mddev_t *mddev, int bits); 151extern int md_raid1_congested(mddev_t *mddev, int bits);
130 152
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e846688962..1d44228530a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -22,6 +22,7 @@
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/ratelimit.h>
25#include "md.h" 26#include "md.h"
26#include "raid10.h" 27#include "raid10.h"
27#include "raid0.h" 28#include "raid0.h"
@@ -123,7 +124,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 for (j = 0 ; j < nalloc; j++) { 124 for (j = 0 ; j < nalloc; j++) {
124 bio = r10_bio->devs[j].bio; 125 bio = r10_bio->devs[j].bio;
125 for (i = 0; i < RESYNC_PAGES; i++) { 126 for (i = 0; i < RESYNC_PAGES; i++) {
126 page = alloc_page(gfp_flags); 127 if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
128 &conf->mddev->recovery)) {
129 /* we can share bv_page's during recovery */
130 struct bio *rbio = r10_bio->devs[0].bio;
131 page = rbio->bi_io_vec[i].bv_page;
132 get_page(page);
133 } else
134 page = alloc_page(gfp_flags);
127 if (unlikely(!page)) 135 if (unlikely(!page))
128 goto out_free_pages; 136 goto out_free_pages;
129 137
@@ -173,7 +181,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173 181
174 for (i = 0; i < conf->copies; i++) { 182 for (i = 0; i < conf->copies; i++) {
175 struct bio **bio = & r10_bio->devs[i].bio; 183 struct bio **bio = & r10_bio->devs[i].bio;
176 if (*bio && *bio != IO_BLOCKED) 184 if (!BIO_SPECIAL(*bio))
177 bio_put(*bio); 185 bio_put(*bio);
178 *bio = NULL; 186 *bio = NULL;
179 } 187 }
@@ -183,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
183{ 191{
184 conf_t *conf = r10_bio->mddev->private; 192 conf_t *conf = r10_bio->mddev->private;
185 193
186 /*
187 * Wake up any possible resync thread that waits for the device
188 * to go idle.
189 */
190 allow_barrier(conf);
191
192 put_all_bios(conf, r10_bio); 194 put_all_bios(conf, r10_bio);
193 mempool_free(r10_bio, conf->r10bio_pool); 195 mempool_free(r10_bio, conf->r10bio_pool);
194} 196}
@@ -227,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
227static void raid_end_bio_io(r10bio_t *r10_bio) 229static void raid_end_bio_io(r10bio_t *r10_bio)
228{ 230{
229 struct bio *bio = r10_bio->master_bio; 231 struct bio *bio = r10_bio->master_bio;
232 int done;
233 conf_t *conf = r10_bio->mddev->private;
230 234
231 bio_endio(bio, 235 if (bio->bi_phys_segments) {
232 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO); 236 unsigned long flags;
237 spin_lock_irqsave(&conf->device_lock, flags);
238 bio->bi_phys_segments--;
239 done = (bio->bi_phys_segments == 0);
240 spin_unlock_irqrestore(&conf->device_lock, flags);
241 } else
242 done = 1;
243 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
244 clear_bit(BIO_UPTODATE, &bio->bi_flags);
245 if (done) {
246 bio_endio(bio, 0);
247 /*
248 * Wake up any possible resync thread that waits for the device
249 * to go idle.
250 */
251 allow_barrier(conf);
252 }
233 free_r10bio(r10_bio); 253 free_r10bio(r10_bio);
234} 254}
235 255
@@ -244,6 +264,26 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
244 r10_bio->devs[slot].addr + (r10_bio->sectors); 264 r10_bio->devs[slot].addr + (r10_bio->sectors);
245} 265}
246 266
267/*
268 * Find the disk number which triggered given bio
269 */
270static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
271 struct bio *bio, int *slotp)
272{
273 int slot;
274
275 for (slot = 0; slot < conf->copies; slot++)
276 if (r10_bio->devs[slot].bio == bio)
277 break;
278
279 BUG_ON(slot == conf->copies);
280 update_head_pos(slot, r10_bio);
281
282 if (slotp)
283 *slotp = slot;
284 return r10_bio->devs[slot].devnum;
285}
286
247static void raid10_end_read_request(struct bio *bio, int error) 287static void raid10_end_read_request(struct bio *bio, int error)
248{ 288{
249 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 289 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -277,34 +317,60 @@ static void raid10_end_read_request(struct bio *bio, int error)
277 * oops, read error - keep the refcount on the rdev 317 * oops, read error - keep the refcount on the rdev
278 */ 318 */
279 char b[BDEVNAME_SIZE]; 319 char b[BDEVNAME_SIZE];
280 if (printk_ratelimit()) 320 printk_ratelimited(KERN_ERR
281 printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 321 "md/raid10:%s: %s: rescheduling sector %llu\n",
282 mdname(conf->mddev), 322 mdname(conf->mddev),
283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 323 bdevname(conf->mirrors[dev].rdev->bdev, b),
324 (unsigned long long)r10_bio->sector);
325 set_bit(R10BIO_ReadError, &r10_bio->state);
284 reschedule_retry(r10_bio); 326 reschedule_retry(r10_bio);
285 } 327 }
286} 328}
287 329
330static void close_write(r10bio_t *r10_bio)
331{
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334 r10_bio->sectors,
335 !test_bit(R10BIO_Degraded, &r10_bio->state),
336 0);
337 md_write_end(r10_bio->mddev);
338}
339
340static void one_write_done(r10bio_t *r10_bio)
341{
342 if (atomic_dec_and_test(&r10_bio->remaining)) {
343 if (test_bit(R10BIO_WriteError, &r10_bio->state))
344 reschedule_retry(r10_bio);
345 else {
346 close_write(r10_bio);
347 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
348 reschedule_retry(r10_bio);
349 else
350 raid_end_bio_io(r10_bio);
351 }
352 }
353}
354
288static void raid10_end_write_request(struct bio *bio, int error) 355static void raid10_end_write_request(struct bio *bio, int error)
289{ 356{
290 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 357 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
291 r10bio_t *r10_bio = bio->bi_private; 358 r10bio_t *r10_bio = bio->bi_private;
292 int slot, dev; 359 int dev;
360 int dec_rdev = 1;
293 conf_t *conf = r10_bio->mddev->private; 361 conf_t *conf = r10_bio->mddev->private;
362 int slot;
294 363
295 for (slot = 0; slot < conf->copies; slot++) 364 dev = find_bio_disk(conf, r10_bio, bio, &slot);
296 if (r10_bio->devs[slot].bio == bio)
297 break;
298 dev = r10_bio->devs[slot].devnum;
299 365
300 /* 366 /*
301 * this branch is our 'one mirror IO has finished' event handler: 367 * this branch is our 'one mirror IO has finished' event handler:
302 */ 368 */
303 if (!uptodate) { 369 if (!uptodate) {
304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 370 set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);
305 /* an I/O failed, we can't clear the bitmap */ 371 set_bit(R10BIO_WriteError, &r10_bio->state);
306 set_bit(R10BIO_Degraded, &r10_bio->state); 372 dec_rdev = 0;
307 } else 373 } else {
308 /* 374 /*
309 * Set R10BIO_Uptodate in our master bio, so that 375 * Set R10BIO_Uptodate in our master bio, so that
310 * we will return a good error code to the higher 376
@@ -314,26 +380,31 @@ static void raid10_end_write_request(struct bio *bio, int error)
314 * user-side. So if something waits for IO, then it will 380 * user-side. So if something waits for IO, then it will
315 * wait for the 'master' bio. 381 * wait for the 'master' bio.
316 */ 382 */
383 sector_t first_bad;
384 int bad_sectors;
385
317 set_bit(R10BIO_Uptodate, &r10_bio->state); 386 set_bit(R10BIO_Uptodate, &r10_bio->state);
318 387
319 update_head_pos(slot, r10_bio); 388 /* Maybe we can clear some bad blocks. */
389 if (is_badblock(conf->mirrors[dev].rdev,
390 r10_bio->devs[slot].addr,
391 r10_bio->sectors,
392 &first_bad, &bad_sectors)) {
393 bio_put(bio);
394 r10_bio->devs[slot].bio = IO_MADE_GOOD;
395 dec_rdev = 0;
396 set_bit(R10BIO_MadeGood, &r10_bio->state);
397 }
398 }
320 399
321 /* 400 /*
322 * 401 *
323 * Let's see if all mirrored write operations have finished 402 * Let's see if all mirrored write operations have finished
324 * already. 403 * already.
325 */ 404 */
326 if (atomic_dec_and_test(&r10_bio->remaining)) { 405 one_write_done(r10_bio);
327 /* clear the bitmap if all writes complete successfully */ 406 if (dec_rdev)
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 407 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
329 r10_bio->sectors,
330 !test_bit(R10BIO_Degraded, &r10_bio->state),
331 0);
332 md_write_end(r10_bio->mddev);
333 raid_end_bio_io(r10_bio);
334 }
335
336 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
337} 408}
338 409
339 410
@@ -484,11 +555,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
484 * FIXME: possibly should rethink readbalancing and do it differently 555 * FIXME: possibly should rethink readbalancing and do it differently
485 * depending on near_copies / far_copies geometry. 556 * depending on near_copies / far_copies geometry.
486 */ 557 */
487static int read_balance(conf_t *conf, r10bio_t *r10_bio) 558static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
488{ 559{
489 const sector_t this_sector = r10_bio->sector; 560 const sector_t this_sector = r10_bio->sector;
490 int disk, slot; 561 int disk, slot;
491 const int sectors = r10_bio->sectors; 562 int sectors = r10_bio->sectors;
563 int best_good_sectors;
492 sector_t new_distance, best_dist; 564 sector_t new_distance, best_dist;
493 mdk_rdev_t *rdev; 565 mdk_rdev_t *rdev;
494 int do_balance; 566 int do_balance;
@@ -497,8 +569,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
497 raid10_find_phys(conf, r10_bio); 569 raid10_find_phys(conf, r10_bio);
498 rcu_read_lock(); 570 rcu_read_lock();
499retry: 571retry:
572 sectors = r10_bio->sectors;
500 best_slot = -1; 573 best_slot = -1;
501 best_dist = MaxSector; 574 best_dist = MaxSector;
575 best_good_sectors = 0;
502 do_balance = 1; 576 do_balance = 1;
503 /* 577 /*
504 * Check if we can balance. We can balance on the whole 578 * Check if we can balance. We can balance on the whole
@@ -511,6 +585,10 @@ retry:
511 do_balance = 0; 585 do_balance = 0;
512 586
513 for (slot = 0; slot < conf->copies ; slot++) { 587 for (slot = 0; slot < conf->copies ; slot++) {
588 sector_t first_bad;
589 int bad_sectors;
590 sector_t dev_sector;
591
514 if (r10_bio->devs[slot].bio == IO_BLOCKED) 592 if (r10_bio->devs[slot].bio == IO_BLOCKED)
515 continue; 593 continue;
516 disk = r10_bio->devs[slot].devnum; 594 disk = r10_bio->devs[slot].devnum;
@@ -520,6 +598,37 @@ retry:
520 if (!test_bit(In_sync, &rdev->flags)) 598 if (!test_bit(In_sync, &rdev->flags))
521 continue; 599 continue;
522 600
601 dev_sector = r10_bio->devs[slot].addr;
602 if (is_badblock(rdev, dev_sector, sectors,
603 &first_bad, &bad_sectors)) {
604 if (best_dist < MaxSector)
605 /* Already have a better slot */
606 continue;
607 if (first_bad <= dev_sector) {
608 /* Cannot read here. If this is the
609 * 'primary' device, then we must not read
610 * beyond 'bad_sectors' from another device.
611 */
612 bad_sectors -= (dev_sector - first_bad);
613 if (!do_balance && sectors > bad_sectors)
614 sectors = bad_sectors;
615 if (best_good_sectors > sectors)
616 best_good_sectors = sectors;
617 } else {
618 sector_t good_sectors =
619 first_bad - dev_sector;
620 if (good_sectors > best_good_sectors) {
621 best_good_sectors = good_sectors;
622 best_slot = slot;
623 }
624 if (!do_balance)
625 /* Must read from here */
626 break;
627 }
628 continue;
629 } else
630 best_good_sectors = sectors;
631
523 if (!do_balance) 632 if (!do_balance)
524 break; 633 break;
525 634
@@ -561,6 +670,7 @@ retry:
561 } else 670 } else
562 disk = -1; 671 disk = -1;
563 rcu_read_unlock(); 672 rcu_read_unlock();
673 *max_sectors = best_good_sectors;
564 674
565 return disk; 675 return disk;
566} 676}
@@ -734,6 +844,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
734 unsigned long flags; 844 unsigned long flags;
735 mdk_rdev_t *blocked_rdev; 845 mdk_rdev_t *blocked_rdev;
736 int plugged; 846 int plugged;
847 int sectors_handled;
848 int max_sectors;
737 849
738 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 850 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
739 md_flush_request(mddev, bio); 851 md_flush_request(mddev, bio);
@@ -808,12 +920,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
808 r10_bio->sector = bio->bi_sector; 920 r10_bio->sector = bio->bi_sector;
809 r10_bio->state = 0; 921 r10_bio->state = 0;
810 922
923 /* We might need to issue multiple reads to different
924 * devices if there are bad blocks around, so we keep
925 * track of the number of reads in bio->bi_phys_segments.
926 * If this is 0, there is only one r10_bio and no locking
927 * will be needed when the request completes. If it is
928 * non-zero, then it is the number of not-completed requests.
929 */
930 bio->bi_phys_segments = 0;
931 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
932
811 if (rw == READ) { 933 if (rw == READ) {
812 /* 934 /*
813 * read balancing logic: 935 * read balancing logic:
814 */ 936 */
815 int disk = read_balance(conf, r10_bio); 937 int disk;
816 int slot = r10_bio->read_slot; 938 int slot;
939
940read_again:
941 disk = read_balance(conf, r10_bio, &max_sectors);
942 slot = r10_bio->read_slot;
817 if (disk < 0) { 943 if (disk < 0) {
818 raid_end_bio_io(r10_bio); 944 raid_end_bio_io(r10_bio);
819 return 0; 945 return 0;
@@ -821,6 +947,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 mirror = conf->mirrors + disk; 947 mirror = conf->mirrors + disk;
822 948
823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 949 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
950 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
951 max_sectors);
824 952
825 r10_bio->devs[slot].bio = read_bio; 953 r10_bio->devs[slot].bio = read_bio;
826 954
@@ -831,7 +959,37 @@ static int make_request(mddev_t *mddev, struct bio * bio)
831 read_bio->bi_rw = READ | do_sync; 959 read_bio->bi_rw = READ | do_sync;
832 read_bio->bi_private = r10_bio; 960 read_bio->bi_private = r10_bio;
833 961
834 generic_make_request(read_bio); 962 if (max_sectors < r10_bio->sectors) {
963 /* Could not read all from this device, so we will
964 * need another r10_bio.
965 */
966 sectors_handled = (r10_bio->sectors + max_sectors
967 - bio->bi_sector);
968 r10_bio->sectors = max_sectors;
969 spin_lock_irq(&conf->device_lock);
970 if (bio->bi_phys_segments == 0)
971 bio->bi_phys_segments = 2;
972 else
973 bio->bi_phys_segments++;
974 spin_unlock(&conf->device_lock);
975 /* Cannot call generic_make_request directly
976 * as that will be queued in __generic_make_request
977 * and subsequent mempool_alloc might block
978 * waiting for it. so hand bio over to raid10d.
979 */
980 reschedule_retry(r10_bio);
981
982 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
983
984 r10_bio->master_bio = bio;
985 r10_bio->sectors = ((bio->bi_size >> 9)
986 - sectors_handled);
987 r10_bio->state = 0;
988 r10_bio->mddev = mddev;
989 r10_bio->sector = bio->bi_sector + sectors_handled;
990 goto read_again;
991 } else
992 generic_make_request(read_bio);
835 return 0; 993 return 0;
836 } 994 }
837 995
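The read path above walks the master bio in chunks of whatever read_balance() reports as safely readable, issuing one r10_bio per chunk and counting the outstanding chunks so the master bio is completed only once (the kernel keeps that count in bi_phys_segments, with 0 meaning a single un-split request). A rough userspace sketch of the chunking and counting; the chunk-size function is a made-up stand-in for read_balance() and the counter uses a simpler zero-based convention:

#include <stdio.h>

/* Pretend a bad block limits each device to this many sectors before we
 * must switch copies; stands in for read_balance()'s *max_sectors. */
static int readable_chunk(unsigned long long sector)
{
	return (sector % 96) ? (int)(96 - sector % 96) : 96;
}

int main(void)
{
	unsigned long long sector = 1000;	/* master bio start */
	int remaining = 300;			/* master bio length in sectors */
	int outstanding = 0;			/* sub-requests still in flight */

	while (remaining) {
		int chunk = readable_chunk(sector);
		if (chunk > remaining)
			chunk = remaining;
		outstanding++;			/* one r10_bio per chunk */
		printf("sub-read: sector %llu, %d sectors\n", sector, chunk);
		sector += chunk;
		remaining -= chunk;
	}
	printf("master bio completes after %d sub-reads finish\n", outstanding);
	return 0;
}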
@@ -841,13 +999,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
841 /* first select target devices under rcu_lock and 999 /* first select target devices under rcu_lock and
842 * inc refcount on their rdev. Record them by setting 1000 * inc refcount on their rdev. Record them by setting
843 * bios[x] to bio 1001 * bios[x] to bio
1002 * If there are known/acknowledged bad blocks on any device
1003 * on which we have seen a write error, we want to avoid
1004 * writing to those blocks. This potentially requires several
1005 * writes to write around the bad blocks. Each set of writes
1006 * gets its own r10_bio with a set of bios attached. The number
	1007 * of r10_bios is recorded in bio->bi_phys_segments just as with
1008 * the read case.
844 */ 1009 */
845 plugged = mddev_check_plugged(mddev); 1010 plugged = mddev_check_plugged(mddev);
846 1011
847 raid10_find_phys(conf, r10_bio); 1012 raid10_find_phys(conf, r10_bio);
848 retry_write: 1013retry_write:
849 blocked_rdev = NULL; 1014 blocked_rdev = NULL;
850 rcu_read_lock(); 1015 rcu_read_lock();
1016 max_sectors = r10_bio->sectors;
1017
851 for (i = 0; i < conf->copies; i++) { 1018 for (i = 0; i < conf->copies; i++) {
852 int d = r10_bio->devs[i].devnum; 1019 int d = r10_bio->devs[i].devnum;
853 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 1020 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -856,13 +1023,55 @@ static int make_request(mddev_t *mddev, struct bio * bio)
856 blocked_rdev = rdev; 1023 blocked_rdev = rdev;
857 break; 1024 break;
858 } 1025 }
859 if (rdev && !test_bit(Faulty, &rdev->flags)) { 1026 r10_bio->devs[i].bio = NULL;
860 atomic_inc(&rdev->nr_pending); 1027 if (!rdev || test_bit(Faulty, &rdev->flags)) {
861 r10_bio->devs[i].bio = bio;
862 } else {
863 r10_bio->devs[i].bio = NULL;
864 set_bit(R10BIO_Degraded, &r10_bio->state); 1028 set_bit(R10BIO_Degraded, &r10_bio->state);
1029 continue;
1030 }
1031 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1032 sector_t first_bad;
1033 sector_t dev_sector = r10_bio->devs[i].addr;
1034 int bad_sectors;
1035 int is_bad;
1036
1037 is_bad = is_badblock(rdev, dev_sector,
1038 max_sectors,
1039 &first_bad, &bad_sectors);
1040 if (is_bad < 0) {
1041 /* Mustn't write here until the bad block
1042 * is acknowledged
1043 */
1044 atomic_inc(&rdev->nr_pending);
1045 set_bit(BlockedBadBlocks, &rdev->flags);
1046 blocked_rdev = rdev;
1047 break;
1048 }
1049 if (is_bad && first_bad <= dev_sector) {
1050 /* Cannot write here at all */
1051 bad_sectors -= (dev_sector - first_bad);
1052 if (bad_sectors < max_sectors)
1053 /* Mustn't write more than bad_sectors
1054 * to other devices yet
1055 */
1056 max_sectors = bad_sectors;
1057 /* We don't set R10BIO_Degraded as that
1058 * only applies if the disk is missing,
1059 * so it might be re-added, and we want to
1060 * know to recover this chunk.
1061 * In this case the device is here, and the
1062 * fact that this chunk is not in-sync is
1063 * recorded in the bad block log.
1064 */
1065 continue;
1066 }
1067 if (is_bad) {
1068 int good_sectors = first_bad - dev_sector;
1069 if (good_sectors < max_sectors)
1070 max_sectors = good_sectors;
1071 }
865 } 1072 }
1073 r10_bio->devs[i].bio = bio;
1074 atomic_inc(&rdev->nr_pending);
866 } 1075 }
867 rcu_read_unlock(); 1076 rcu_read_unlock();
868 1077
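On the write side, the three possible is_badblock() results seen above map to three actions: a negative result (unacknowledged bad block) makes the whole request wait, a bad range covering the start of the write means this device is skipped for now (and its remaining bad length caps what may be written elsewhere), and a bad range further on merely clips max_sectors. An illustrative classifier with invented names, sketching that decision outside the kernel:

#include <stdio.h>

enum write_action { WRITE_BLOCK, WRITE_SKIP_DEV, WRITE_CLIPPED, WRITE_FULL };

/* is_bad follows the convention the patch relies on:
 * <0 unacknowledged bad block, >0 acknowledged, 0 none. */
static enum write_action classify(int is_bad, unsigned long long dev_sector,
                                  unsigned long long first_bad,
                                  int bad_sectors, int *max_sectors)
{
	if (is_bad < 0)
		return WRITE_BLOCK;		/* must wait for acknowledgement */
	if (is_bad && first_bad <= dev_sector) {
		int left = bad_sectors - (int)(dev_sector - first_bad);
		if (left < *max_sectors)
			*max_sectors = left;	/* limits the other devices too */
		return WRITE_SKIP_DEV;		/* don't write this device yet */
	}
	if (is_bad) {
		int good = (int)(first_bad - dev_sector);
		if (good < *max_sectors)
			*max_sectors = good;	/* clip before the bad range */
		return WRITE_CLIPPED;
	}
	return WRITE_FULL;
}

int main(void)
{
	static const char *names[] = { "block", "skip-device", "clipped", "full" };
	int max = 128;
	enum write_action act = classify(1, 500, 520, 8, &max);

	printf("%s, max_sectors=%d\n", names[act], max);	/* clipped, 20 */
	return 0;
}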
@@ -882,8 +1091,22 @@ static int make_request(mddev_t *mddev, struct bio * bio)
882 goto retry_write; 1091 goto retry_write;
883 } 1092 }
884 1093
1094 if (max_sectors < r10_bio->sectors) {
1095 /* We are splitting this into multiple parts, so
1096 * we need to prepare for allocating another r10_bio.
1097 */
1098 r10_bio->sectors = max_sectors;
1099 spin_lock_irq(&conf->device_lock);
1100 if (bio->bi_phys_segments == 0)
1101 bio->bi_phys_segments = 2;
1102 else
1103 bio->bi_phys_segments++;
1104 spin_unlock_irq(&conf->device_lock);
1105 }
1106 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1107
885 atomic_set(&r10_bio->remaining, 1); 1108 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); 1109 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
887 1110
888 for (i = 0; i < conf->copies; i++) { 1111 for (i = 0; i < conf->copies; i++) {
889 struct bio *mbio; 1112 struct bio *mbio;
@@ -892,10 +1115,12 @@ static int make_request(mddev_t *mddev, struct bio * bio)
892 continue; 1115 continue;
893 1116
894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1117 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1118 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1119 max_sectors);
895 r10_bio->devs[i].bio = mbio; 1120 r10_bio->devs[i].bio = mbio;
896 1121
897 mbio->bi_sector = r10_bio->devs[i].addr+ 1122 mbio->bi_sector = (r10_bio->devs[i].addr+
898 conf->mirrors[d].rdev->data_offset; 1123 conf->mirrors[d].rdev->data_offset);
899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1124 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
900 mbio->bi_end_io = raid10_end_write_request; 1125 mbio->bi_end_io = raid10_end_write_request;
901 mbio->bi_rw = WRITE | do_sync | do_fua; 1126 mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -907,15 +1132,26 @@ static int make_request(mddev_t *mddev, struct bio * bio)
907 spin_unlock_irqrestore(&conf->device_lock, flags); 1132 spin_unlock_irqrestore(&conf->device_lock, flags);
908 } 1133 }
909 1134
910 if (atomic_dec_and_test(&r10_bio->remaining)) { 1135 /* Don't remove the bias on 'remaining' (one_write_done) until
911 /* This matches the end of raid10_end_write_request() */ 1136 * after checking if we need to go around again.
912 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 1137 */
913 r10_bio->sectors, 1138
914 !test_bit(R10BIO_Degraded, &r10_bio->state), 1139 if (sectors_handled < (bio->bi_size >> 9)) {
915 0); 1140 one_write_done(r10_bio);
916 md_write_end(mddev); 1141 /* We need another r10_bio. It has already been counted
917 raid_end_bio_io(r10_bio); 1142 * in bio->bi_phys_segments.
1143 */
1144 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1145
1146 r10_bio->master_bio = bio;
1147 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1148
1149 r10_bio->mddev = mddev;
1150 r10_bio->sector = bio->bi_sector + sectors_handled;
1151 r10_bio->state = 0;
1152 goto retry_write;
918 } 1153 }
1154 one_write_done(r10_bio);
919 1155
920 /* In case raid10d snuck in to freeze_array */ 1156 /* In case raid10d snuck in to freeze_array */
921 wake_up(&conf->wait_barrier); 1157 wake_up(&conf->wait_barrier);
@@ -949,6 +1185,30 @@ static void status(struct seq_file *seq, mddev_t *mddev)
949 seq_printf(seq, "]"); 1185 seq_printf(seq, "]");
950} 1186}
951 1187
1188/* check if there are enough drives for
	1189 * every block to appear on at least one.
1190 * Don't consider the device numbered 'ignore'
1191 * as we might be about to remove it.
1192 */
1193static int enough(conf_t *conf, int ignore)
1194{
1195 int first = 0;
1196
1197 do {
1198 int n = conf->copies;
1199 int cnt = 0;
1200 while (n--) {
1201 if (conf->mirrors[first].rdev &&
1202 first != ignore)
1203 cnt++;
1204 first = (first+1) % conf->raid_disks;
1205 }
1206 if (cnt == 0)
1207 return 0;
1208 } while (first != 0);
1209 return 1;
1210}
1211
952static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1212static void error(mddev_t *mddev, mdk_rdev_t *rdev)
953{ 1213{
954 char b[BDEVNAME_SIZE]; 1214 char b[BDEVNAME_SIZE];
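enough() walks the devices in groups of 'copies' consecutive slots, wrapping until it comes back to slot 0, and declares the array unusable if any group has no working member, since some block would then have lost every copy. The same loop over a plain bool array, ignoring the 'ignore' parameter for brevity:

#include <stdio.h>
#include <stdbool.h>

/* Every group of 'copies' consecutive slots (wrapping around) must
 * contain at least one present device, or some block is unreadable. */
static int enough(const bool *present, int raid_disks, int copies)
{
	int first = 0;

	do {
		int n = copies, cnt = 0;
		while (n--) {
			if (present[first])
				cnt++;
			first = (first + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

int main(void)
{
	bool alternating[4] = { true, false, true, false };
	bool pair_lost[4]   = { true, true, false, false };

	printf("%d\n", enough(alternating, 4, 2));	/* 1: each pair has a disk */
	printf("%d\n", enough(pair_lost, 4, 2));	/* 0: devices 2 and 3 both gone */
	return 0;
}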
@@ -961,13 +1221,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
961 * else mark the drive as failed 1221 * else mark the drive as failed
962 */ 1222 */
963 if (test_bit(In_sync, &rdev->flags) 1223 if (test_bit(In_sync, &rdev->flags)
964 && conf->raid_disks-mddev->degraded == 1) 1224 && !enough(conf, rdev->raid_disk))
965 /* 1225 /*
966 * Don't fail the drive, just return an IO error. 1226 * Don't fail the drive, just return an IO error.
967 * The test should really be more sophisticated than
968 * "working_disks == 1", but it isn't critical, and
969 * can wait until we do more sophisticated "is the drive
970 * really dead" tests...
971 */ 1227 */
972 return; 1228 return;
973 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1229 if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -980,6 +1236,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
980 */ 1236 */
981 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1237 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
982 } 1238 }
1239 set_bit(Blocked, &rdev->flags);
983 set_bit(Faulty, &rdev->flags); 1240 set_bit(Faulty, &rdev->flags);
984 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1241 set_bit(MD_CHANGE_DEVS, &mddev->flags);
985 printk(KERN_ALERT 1242 printk(KERN_ALERT
@@ -1022,27 +1279,6 @@ static void close_sync(conf_t *conf)
1022 conf->r10buf_pool = NULL; 1279 conf->r10buf_pool = NULL;
1023} 1280}
1024 1281
1025/* check if there are enough drives for
1026 * every block to appear on atleast one
1027 */
1028static int enough(conf_t *conf)
1029{
1030 int first = 0;
1031
1032 do {
1033 int n = conf->copies;
1034 int cnt = 0;
1035 while (n--) {
1036 if (conf->mirrors[first].rdev)
1037 cnt++;
1038 first = (first+1) % conf->raid_disks;
1039 }
1040 if (cnt == 0)
1041 return 0;
1042 } while (first != 0);
1043 return 1;
1044}
1045
1046static int raid10_spare_active(mddev_t *mddev) 1282static int raid10_spare_active(mddev_t *mddev)
1047{ 1283{
1048 int i; 1284 int i;
@@ -1078,7 +1314,6 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1078 conf_t *conf = mddev->private; 1314 conf_t *conf = mddev->private;
1079 int err = -EEXIST; 1315 int err = -EEXIST;
1080 int mirror; 1316 int mirror;
1081 mirror_info_t *p;
1082 int first = 0; 1317 int first = 0;
1083 int last = conf->raid_disks - 1; 1318 int last = conf->raid_disks - 1;
1084 1319
@@ -1087,44 +1322,47 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1087 * very different from resync 1322 * very different from resync
1088 */ 1323 */
1089 return -EBUSY; 1324 return -EBUSY;
1090 if (!enough(conf)) 1325 if (!enough(conf, -1))
1091 return -EINVAL; 1326 return -EINVAL;
1092 1327
1093 if (rdev->raid_disk >= 0) 1328 if (rdev->raid_disk >= 0)
1094 first = last = rdev->raid_disk; 1329 first = last = rdev->raid_disk;
1095 1330
1096 if (rdev->saved_raid_disk >= 0 && 1331 if (rdev->saved_raid_disk >= first &&
1097 rdev->saved_raid_disk >= first &&
1098 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1332 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1099 mirror = rdev->saved_raid_disk; 1333 mirror = rdev->saved_raid_disk;
1100 else 1334 else
1101 mirror = first; 1335 mirror = first;
1102 for ( ; mirror <= last ; mirror++) 1336 for ( ; mirror <= last ; mirror++) {
1103 if ( !(p=conf->mirrors+mirror)->rdev) { 1337 mirror_info_t *p = &conf->mirrors[mirror];
1104 1338 if (p->recovery_disabled == mddev->recovery_disabled)
1105 disk_stack_limits(mddev->gendisk, rdev->bdev, 1339 continue;
1106 rdev->data_offset << 9); 1340 if (p->rdev)
1107 /* as we don't honour merge_bvec_fn, we must 1341 continue;
1108 * never risk violating it, so limit
1109 * ->max_segments to one lying with a single
1110 * page, as a one page request is never in
1111 * violation.
1112 */
1113 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1114 blk_queue_max_segments(mddev->queue, 1);
1115 blk_queue_segment_boundary(mddev->queue,
1116 PAGE_CACHE_SIZE - 1);
1117 }
1118 1342
1119 p->head_position = 0; 1343 disk_stack_limits(mddev->gendisk, rdev->bdev,
1120 rdev->raid_disk = mirror; 1344 rdev->data_offset << 9);
1121 err = 0; 1345 /* as we don't honour merge_bvec_fn, we must
1122 if (rdev->saved_raid_disk != mirror) 1346 * never risk violating it, so limit
1123 conf->fullsync = 1; 1347 * ->max_segments to one lying with a single
1124 rcu_assign_pointer(p->rdev, rdev); 1348 * page, as a one page request is never in
1125 break; 1349 * violation.
1350 */
1351 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1352 blk_queue_max_segments(mddev->queue, 1);
1353 blk_queue_segment_boundary(mddev->queue,
1354 PAGE_CACHE_SIZE - 1);
1126 } 1355 }
1127 1356
1357 p->head_position = 0;
1358 rdev->raid_disk = mirror;
1359 err = 0;
1360 if (rdev->saved_raid_disk != mirror)
1361 conf->fullsync = 1;
1362 rcu_assign_pointer(p->rdev, rdev);
1363 break;
1364 }
1365
1128 md_integrity_add_rdev(rdev, mddev); 1366 md_integrity_add_rdev(rdev, mddev);
1129 print_conf(conf); 1367 print_conf(conf);
1130 return err; 1368 return err;
@@ -1149,7 +1387,8 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1149 * is not possible. 1387 * is not possible.
1150 */ 1388 */
1151 if (!test_bit(Faulty, &rdev->flags) && 1389 if (!test_bit(Faulty, &rdev->flags) &&
1152 enough(conf)) { 1390 mddev->recovery_disabled != p->recovery_disabled &&
1391 enough(conf, -1)) {
1153 err = -EBUSY; 1392 err = -EBUSY;
1154 goto abort; 1393 goto abort;
1155 } 1394 }
@@ -1174,24 +1413,18 @@ static void end_sync_read(struct bio *bio, int error)
1174{ 1413{
1175 r10bio_t *r10_bio = bio->bi_private; 1414 r10bio_t *r10_bio = bio->bi_private;
1176 conf_t *conf = r10_bio->mddev->private; 1415 conf_t *conf = r10_bio->mddev->private;
1177 int i,d; 1416 int d;
1178 1417
1179 for (i=0; i<conf->copies; i++) 1418 d = find_bio_disk(conf, r10_bio, bio, NULL);
1180 if (r10_bio->devs[i].bio == bio)
1181 break;
1182 BUG_ON(i == conf->copies);
1183 update_head_pos(i, r10_bio);
1184 d = r10_bio->devs[i].devnum;
1185 1419
1186 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1420 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1187 set_bit(R10BIO_Uptodate, &r10_bio->state); 1421 set_bit(R10BIO_Uptodate, &r10_bio->state);
1188 else { 1422 else
1423 /* The write handler will notice the lack of
1424 * R10BIO_Uptodate and record any errors etc
1425 */
1189 atomic_add(r10_bio->sectors, 1426 atomic_add(r10_bio->sectors,
1190 &conf->mirrors[d].rdev->corrected_errors); 1427 &conf->mirrors[d].rdev->corrected_errors);
1191 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1192 md_error(r10_bio->mddev,
1193 conf->mirrors[d].rdev);
1194 }
1195 1428
1196 /* for reconstruct, we always reschedule after a read. 1429 /* for reconstruct, we always reschedule after a read.
1197 * for resync, only after all reads 1430 * for resync, only after all reads
@@ -1206,40 +1439,60 @@ static void end_sync_read(struct bio *bio, int error)
1206 } 1439 }
1207} 1440}
1208 1441
1209static void end_sync_write(struct bio *bio, int error) 1442static void end_sync_request(r10bio_t *r10_bio)
1210{ 1443{
1211 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1212 r10bio_t *r10_bio = bio->bi_private;
1213 mddev_t *mddev = r10_bio->mddev; 1444 mddev_t *mddev = r10_bio->mddev;
1214 conf_t *conf = mddev->private;
1215 int i,d;
1216
1217 for (i = 0; i < conf->copies; i++)
1218 if (r10_bio->devs[i].bio == bio)
1219 break;
1220 d = r10_bio->devs[i].devnum;
1221
1222 if (!uptodate)
1223 md_error(mddev, conf->mirrors[d].rdev);
1224
1225 update_head_pos(i, r10_bio);
1226 1445
1227 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1228 while (atomic_dec_and_test(&r10_bio->remaining)) { 1446 while (atomic_dec_and_test(&r10_bio->remaining)) {
1229 if (r10_bio->master_bio == NULL) { 1447 if (r10_bio->master_bio == NULL) {
1230 /* the primary of several recovery bios */ 1448 /* the primary of several recovery bios */
1231 sector_t s = r10_bio->sectors; 1449 sector_t s = r10_bio->sectors;
1232 put_buf(r10_bio); 1450 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1451 test_bit(R10BIO_WriteError, &r10_bio->state))
1452 reschedule_retry(r10_bio);
1453 else
1454 put_buf(r10_bio);
1233 md_done_sync(mddev, s, 1); 1455 md_done_sync(mddev, s, 1);
1234 break; 1456 break;
1235 } else { 1457 } else {
1236 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; 1458 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1237 put_buf(r10_bio); 1459 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1460 test_bit(R10BIO_WriteError, &r10_bio->state))
1461 reschedule_retry(r10_bio);
1462 else
1463 put_buf(r10_bio);
1238 r10_bio = r10_bio2; 1464 r10_bio = r10_bio2;
1239 } 1465 }
1240 } 1466 }
1241} 1467}
1242 1468
1469static void end_sync_write(struct bio *bio, int error)
1470{
1471 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1472 r10bio_t *r10_bio = bio->bi_private;
1473 mddev_t *mddev = r10_bio->mddev;
1474 conf_t *conf = mddev->private;
1475 int d;
1476 sector_t first_bad;
1477 int bad_sectors;
1478 int slot;
1479
1480 d = find_bio_disk(conf, r10_bio, bio, &slot);
1481
1482 if (!uptodate) {
1483 set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
1484 set_bit(R10BIO_WriteError, &r10_bio->state);
1485 } else if (is_badblock(conf->mirrors[d].rdev,
1486 r10_bio->devs[slot].addr,
1487 r10_bio->sectors,
1488 &first_bad, &bad_sectors))
1489 set_bit(R10BIO_MadeGood, &r10_bio->state);
1490
1491 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1492
1493 end_sync_request(r10_bio);
1494}
1495
1243/* 1496/*
1244 * Note: sync and recover are handled very differently for raid10 1497
1245 * This code is for resync. 1498 * This code is for resync.
@@ -1299,11 +1552,12 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1299 if (j == vcnt) 1552 if (j == vcnt)
1300 continue; 1553 continue;
1301 mddev->resync_mismatches += r10_bio->sectors; 1554 mddev->resync_mismatches += r10_bio->sectors;
1555 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1556 /* Don't fix anything. */
1557 continue;
1302 } 1558 }
1303 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 1559 /* Ok, we need to write this bio, either to correct an
1304 /* Don't fix anything. */ 1560 * inconsistency or to correct an unreadable block.
1305 continue;
1306 /* Ok, we need to write this bio
1307 * First we need to fixup bv_offset, bv_len and 1561 * First we need to fixup bv_offset, bv_len and
1308 * bi_vecs, as the read request might have corrupted these 1562 * bi_vecs, as the read request might have corrupted these
1309 */ 1563 */
@@ -1355,32 +1609,107 @@ done:
1355 * The second for writing. 1609 * The second for writing.
1356 * 1610 *
1357 */ 1611 */
1612static void fix_recovery_read_error(r10bio_t *r10_bio)
1613{
1614 /* We got a read error during recovery.
1615 * We repeat the read in smaller page-sized sections.
1616 * If a read succeeds, write it to the new device or record
1617 * a bad block if we cannot.
1618 * If a read fails, record a bad block on both old and
1619 * new devices.
1620 */
1621 mddev_t *mddev = r10_bio->mddev;
1622 conf_t *conf = mddev->private;
1623 struct bio *bio = r10_bio->devs[0].bio;
1624 sector_t sect = 0;
1625 int sectors = r10_bio->sectors;
1626 int idx = 0;
1627 int dr = r10_bio->devs[0].devnum;
1628 int dw = r10_bio->devs[1].devnum;
1629
1630 while (sectors) {
1631 int s = sectors;
1632 mdk_rdev_t *rdev;
1633 sector_t addr;
1634 int ok;
1635
1636 if (s > (PAGE_SIZE>>9))
1637 s = PAGE_SIZE >> 9;
1638
1639 rdev = conf->mirrors[dr].rdev;
1640 addr = r10_bio->devs[0].addr + sect,
1641 ok = sync_page_io(rdev,
1642 addr,
1643 s << 9,
1644 bio->bi_io_vec[idx].bv_page,
1645 READ, false);
1646 if (ok) {
1647 rdev = conf->mirrors[dw].rdev;
1648 addr = r10_bio->devs[1].addr + sect;
1649 ok = sync_page_io(rdev,
1650 addr,
1651 s << 9,
1652 bio->bi_io_vec[idx].bv_page,
1653 WRITE, false);
1654 if (!ok)
1655 set_bit(WriteErrorSeen, &rdev->flags);
1656 }
1657 if (!ok) {
1658 /* We don't worry if we cannot set a bad block -
1659 * it really is bad so there is no loss in not
1660 * recording it yet
1661 */
1662 rdev_set_badblocks(rdev, addr, s, 0);
1663
1664 if (rdev != conf->mirrors[dw].rdev) {
1665 /* need bad block on destination too */
1666 mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
1667 addr = r10_bio->devs[1].addr + sect;
1668 ok = rdev_set_badblocks(rdev2, addr, s, 0);
1669 if (!ok) {
1670 /* just abort the recovery */
1671 printk(KERN_NOTICE
1672 "md/raid10:%s: recovery aborted"
1673 " due to read error\n",
1674 mdname(mddev));
1675
1676 conf->mirrors[dw].recovery_disabled
1677 = mddev->recovery_disabled;
1678 set_bit(MD_RECOVERY_INTR,
1679 &mddev->recovery);
1680 break;
1681 }
1682 }
1683 }
1684
1685 sectors -= s;
1686 sect += s;
1687 idx++;
1688 }
1689}
1358 1690
1359static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) 1691static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1360{ 1692{
1361 conf_t *conf = mddev->private; 1693 conf_t *conf = mddev->private;
1362 int i, d; 1694 int d;
1363 struct bio *bio, *wbio; 1695 struct bio *wbio;
1364 1696
1697 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
1698 fix_recovery_read_error(r10_bio);
1699 end_sync_request(r10_bio);
1700 return;
1701 }
1365 1702
1366 /* move the pages across to the second bio 1703 /*
1704 * share the pages with the first bio
1367 * and submit the write request 1705 * and submit the write request
1368 */ 1706 */
1369 bio = r10_bio->devs[0].bio;
1370 wbio = r10_bio->devs[1].bio; 1707 wbio = r10_bio->devs[1].bio;
1371 for (i=0; i < wbio->bi_vcnt; i++) {
1372 struct page *p = bio->bi_io_vec[i].bv_page;
1373 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1374 wbio->bi_io_vec[i].bv_page = p;
1375 }
1376 d = r10_bio->devs[1].devnum; 1708 d = r10_bio->devs[1].devnum;
1377 1709
1378 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1710 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1379 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1711 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1380 if (test_bit(R10BIO_Uptodate, &r10_bio->state)) 1712 generic_make_request(wbio);
1381 generic_make_request(wbio);
1382 else
1383 bio_endio(wbio, -EIO);
1384} 1713}
1385 1714
1386 1715
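fix_recovery_read_error() retries the failed recovery read one page-sized piece at a time, so a single unreadable sector only costs its own block: readable pieces are written to the device being rebuilt, unreadable ones are recorded as bad on both the source and the destination (and recovery is aborted if the destination cannot record them). A userspace sketch of that chunking loop; the read/write/record helpers below are stand-ins for sync_page_io() and rdev_set_badblocks(), not real APIs:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SECTORS (4096 >> 9)	/* 8 sectors per 4K page */

/* Stand-ins for the kernel helpers; here the source read fails once. */
static bool read_src(unsigned long long sector, int len)  { return sector != 1016; }
static bool write_dst(unsigned long long sector, int len) { return true; }
static void record_bad(const char *dev, unsigned long long sector, int len)
{
	printf("bad block on %s: %llu+%d\n", dev, sector, len);
}

int main(void)
{
	unsigned long long sect = 1000;	/* start of the region to recover */
	int sectors = 32;		/* total length in sectors */

	while (sectors) {
		int s = sectors > PAGE_SECTORS ? PAGE_SECTORS : sectors;

		if (read_src(sect, s)) {
			if (!write_dst(sect, s))
				record_bad("dst", sect, s);
		} else {
			/* unreadable on the source: mark both copies bad */
			record_bad("src", sect, s);
			record_bad("dst", sect, s);
		}
		sect += s;
		sectors -= s;
	}
	return 0;
}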
@@ -1421,6 +1750,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1421 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 1750 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1422} 1751}
1423 1752
1753static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
1754 int sectors, struct page *page, int rw)
1755{
1756 sector_t first_bad;
1757 int bad_sectors;
1758
1759 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
1760 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
1761 return -1;
1762 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
1763 /* success */
1764 return 1;
1765 if (rw == WRITE)
1766 set_bit(WriteErrorSeen, &rdev->flags);
1767 /* need to record an error - either for the block or the device */
1768 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1769 md_error(rdev->mddev, rdev);
1770 return 0;
1771}
1772
1424/* 1773/*
1425 * This is a kernel thread which: 1774 * This is a kernel thread which:
1426 * 1775 *
@@ -1476,10 +1825,15 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1476 1825
1477 rcu_read_lock(); 1826 rcu_read_lock();
1478 do { 1827 do {
1828 sector_t first_bad;
1829 int bad_sectors;
1830
1479 d = r10_bio->devs[sl].devnum; 1831 d = r10_bio->devs[sl].devnum;
1480 rdev = rcu_dereference(conf->mirrors[d].rdev); 1832 rdev = rcu_dereference(conf->mirrors[d].rdev);
1481 if (rdev && 1833 if (rdev &&
1482 test_bit(In_sync, &rdev->flags)) { 1834 test_bit(In_sync, &rdev->flags) &&
1835 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
1836 &first_bad, &bad_sectors) == 0) {
1483 atomic_inc(&rdev->nr_pending); 1837 atomic_inc(&rdev->nr_pending);
1484 rcu_read_unlock(); 1838 rcu_read_unlock();
1485 success = sync_page_io(rdev, 1839 success = sync_page_io(rdev,
@@ -1499,9 +1853,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1499 rcu_read_unlock(); 1853 rcu_read_unlock();
1500 1854
1501 if (!success) { 1855 if (!success) {
1502 /* Cannot read from anywhere -- bye bye array */ 1856 /* Cannot read from anywhere, just mark the block
1857 * as bad on the first device to discourage future
1858 * reads.
1859 */
1503 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 1860 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1504 md_error(mddev, conf->mirrors[dn].rdev); 1861 rdev = conf->mirrors[dn].rdev;
1862
1863 if (!rdev_set_badblocks(
1864 rdev,
1865 r10_bio->devs[r10_bio->read_slot].addr
1866 + sect,
1867 s, 0))
1868 md_error(mddev, rdev);
1505 break; 1869 break;
1506 } 1870 }
1507 1871
@@ -1516,80 +1880,82 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1516 sl--; 1880 sl--;
1517 d = r10_bio->devs[sl].devnum; 1881 d = r10_bio->devs[sl].devnum;
1518 rdev = rcu_dereference(conf->mirrors[d].rdev); 1882 rdev = rcu_dereference(conf->mirrors[d].rdev);
1519 if (rdev && 1883 if (!rdev ||
1520 test_bit(In_sync, &rdev->flags)) { 1884 !test_bit(In_sync, &rdev->flags))
1521 atomic_inc(&rdev->nr_pending); 1885 continue;
1522 rcu_read_unlock(); 1886
1523 atomic_add(s, &rdev->corrected_errors); 1887 atomic_inc(&rdev->nr_pending);
1524 if (sync_page_io(rdev, 1888 rcu_read_unlock();
1525 r10_bio->devs[sl].addr + 1889 if (r10_sync_page_io(rdev,
1526 sect, 1890 r10_bio->devs[sl].addr +
1527 s<<9, conf->tmppage, WRITE, false) 1891 sect,
1528 == 0) { 1892 s<<9, conf->tmppage, WRITE)
1529 /* Well, this device is dead */ 1893 == 0) {
1530 printk(KERN_NOTICE 1894 /* Well, this device is dead */
1531 "md/raid10:%s: read correction " 1895 printk(KERN_NOTICE
1532 "write failed" 1896 "md/raid10:%s: read correction "
1533 " (%d sectors at %llu on %s)\n", 1897 "write failed"
1534 mdname(mddev), s, 1898 " (%d sectors at %llu on %s)\n",
1535 (unsigned long long)( 1899 mdname(mddev), s,
1536 sect + rdev->data_offset), 1900 (unsigned long long)(
1537 bdevname(rdev->bdev, b)); 1901 sect + rdev->data_offset),
1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1902 bdevname(rdev->bdev, b));
1539 "drive\n", 1903 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1540 mdname(mddev), 1904 "drive\n",
1541 bdevname(rdev->bdev, b)); 1905 mdname(mddev),
1542 md_error(mddev, rdev); 1906 bdevname(rdev->bdev, b));
1543 }
1544 rdev_dec_pending(rdev, mddev);
1545 rcu_read_lock();
1546 } 1907 }
1908 rdev_dec_pending(rdev, mddev);
1909 rcu_read_lock();
1547 } 1910 }
1548 sl = start; 1911 sl = start;
1549 while (sl != r10_bio->read_slot) { 1912 while (sl != r10_bio->read_slot) {
1913 char b[BDEVNAME_SIZE];
1550 1914
1551 if (sl==0) 1915 if (sl==0)
1552 sl = conf->copies; 1916 sl = conf->copies;
1553 sl--; 1917 sl--;
1554 d = r10_bio->devs[sl].devnum; 1918 d = r10_bio->devs[sl].devnum;
1555 rdev = rcu_dereference(conf->mirrors[d].rdev); 1919 rdev = rcu_dereference(conf->mirrors[d].rdev);
1556 if (rdev && 1920 if (!rdev ||
1557 test_bit(In_sync, &rdev->flags)) { 1921 !test_bit(In_sync, &rdev->flags))
1558 char b[BDEVNAME_SIZE]; 1922 continue;
1559 atomic_inc(&rdev->nr_pending);
1560 rcu_read_unlock();
1561 if (sync_page_io(rdev,
1562 r10_bio->devs[sl].addr +
1563 sect,
1564 s<<9, conf->tmppage,
1565 READ, false) == 0) {
1566 /* Well, this device is dead */
1567 printk(KERN_NOTICE
1568 "md/raid10:%s: unable to read back "
1569 "corrected sectors"
1570 " (%d sectors at %llu on %s)\n",
1571 mdname(mddev), s,
1572 (unsigned long long)(
1573 sect + rdev->data_offset),
1574 bdevname(rdev->bdev, b));
1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1576 mdname(mddev),
1577 bdevname(rdev->bdev, b));
1578
1579 md_error(mddev, rdev);
1580 } else {
1581 printk(KERN_INFO
1582 "md/raid10:%s: read error corrected"
1583 " (%d sectors at %llu on %s)\n",
1584 mdname(mddev), s,
1585 (unsigned long long)(
1586 sect + rdev->data_offset),
1587 bdevname(rdev->bdev, b));
1588 }
1589 1923
1590 rdev_dec_pending(rdev, mddev); 1924 atomic_inc(&rdev->nr_pending);
1591 rcu_read_lock(); 1925 rcu_read_unlock();
1926 switch (r10_sync_page_io(rdev,
1927 r10_bio->devs[sl].addr +
1928 sect,
1929 s<<9, conf->tmppage,
1930 READ)) {
1931 case 0:
1932 /* Well, this device is dead */
1933 printk(KERN_NOTICE
1934 "md/raid10:%s: unable to read back "
1935 "corrected sectors"
1936 " (%d sectors at %llu on %s)\n",
1937 mdname(mddev), s,
1938 (unsigned long long)(
1939 sect + rdev->data_offset),
1940 bdevname(rdev->bdev, b));
1941 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1942 "drive\n",
1943 mdname(mddev),
1944 bdevname(rdev->bdev, b));
1945 break;
1946 case 1:
1947 printk(KERN_INFO
1948 "md/raid10:%s: read error corrected"
1949 " (%d sectors at %llu on %s)\n",
1950 mdname(mddev), s,
1951 (unsigned long long)(
1952 sect + rdev->data_offset),
1953 bdevname(rdev->bdev, b));
1954 atomic_add(s, &rdev->corrected_errors);
1592 } 1955 }
1956
1957 rdev_dec_pending(rdev, mddev);
1958 rcu_read_lock();
1593 } 1959 }
1594 rcu_read_unlock(); 1960 rcu_read_unlock();
1595 1961
@@ -1598,21 +1964,254 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1598 } 1964 }
1599} 1965}
1600 1966
1967static void bi_complete(struct bio *bio, int error)
1968{
1969 complete((struct completion *)bio->bi_private);
1970}
1971
1972static int submit_bio_wait(int rw, struct bio *bio)
1973{
1974 struct completion event;
1975 rw |= REQ_SYNC;
1976
1977 init_completion(&event);
1978 bio->bi_private = &event;
1979 bio->bi_end_io = bi_complete;
1980 submit_bio(rw, bio);
1981 wait_for_completion(&event);
1982
1983 return test_bit(BIO_UPTODATE, &bio->bi_flags);
1984}
1985
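submit_bio_wait() above turns an asynchronous bio submission into a synchronous call: the caller parks on a struct completion that the bi_end_io callback (bi_complete) fires, then reads the result out of BIO_UPTODATE. A rough user-space analogue of the same wait-for-callback pattern, built on pthreads; purely illustrative, not kernel code and not part of this patch:

/* Illustrative sketch only: a pthread-based stand-in for the kernel's
 * struct completion, showing the shape of submit_bio_wait().
 */
#include <pthread.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             done;
};

static void init_completion(struct completion *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = 0;
}

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

/* stand-in for the I/O layer: runs asynchronously and fires the "bi_end_io" */
static void *fake_io(void *arg)
{
        complete(arg);
        return NULL;
}

int main(void)
{
        struct completion event;
        pthread_t io;

        init_completion(&event);
        pthread_create(&io, NULL, fake_io, &event);     /* ~ submit_bio(rw, bio) */
        wait_for_completion(&event);                    /* block until the callback ran */
        pthread_join(&io, NULL);
        printf("I/O finished\n");
        return 0;
}

Build with cc -pthread; the kernel version additionally ORs REQ_SYNC into rw and returns the BIO_UPTODATE bit as the status.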
1986static int narrow_write_error(r10bio_t *r10_bio, int i)
1987{
1988 struct bio *bio = r10_bio->master_bio;
1989 mddev_t *mddev = r10_bio->mddev;
1990 conf_t *conf = mddev->private;
1991 mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
1992 /* bio has the data to be written to slot 'i' where
1993 * we just recently had a write error.
1994 * We repeatedly clone the bio and trim down to one block,
1995 * then try the write. Where the write fails we record
1996 * a bad block.
1997 * It is conceivable that the bio doesn't exactly align with
1998 * blocks. We must handle this.
1999 *
2000 * We currently own a reference to the rdev.
2001 */
2002
2003 int block_sectors;
2004 sector_t sector;
2005 int sectors;
2006 int sect_to_write = r10_bio->sectors;
2007 int ok = 1;
2008
2009 if (rdev->badblocks.shift < 0)
2010 return 0;
2011
2012 block_sectors = 1 << rdev->badblocks.shift;
2013 sector = r10_bio->sector;
2014 sectors = ((r10_bio->sector + block_sectors)
2015 & ~(sector_t)(block_sectors - 1))
2016 - sector;
2017
2018 while (sect_to_write) {
2019 struct bio *wbio;
2020 if (sectors > sect_to_write)
2021 sectors = sect_to_write;
2022 /* Write at 'sector' for 'sectors' */
2023 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2024 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2025 wbio->bi_sector = (r10_bio->devs[i].addr+
2026 rdev->data_offset+
2027 (sector - r10_bio->sector));
2028 wbio->bi_bdev = rdev->bdev;
2029 if (submit_bio_wait(WRITE, wbio) == 0)
2030 /* Failure! */
2031 ok = rdev_set_badblocks(rdev, sector,
2032 sectors, 0)
2033 && ok;
2034
2035 bio_put(wbio);
2036 sect_to_write -= sectors;
2037 sector += sectors;
2038 sectors = block_sectors;
2039 }
2040 return ok;
2041}
2042
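narrow_write_error() has to cope with a master bio that does not start on a bad-block boundary: the first chunk is trimmed so it ends at the next boundary (block_sectors = 1 << badblocks.shift), and every later chunk is exactly one block. A minimal user-space sketch of that arithmetic, with made-up values; not part of the patch:

/* Illustrative sketch only: the chunking arithmetic of narrow_write_error(),
 * with hypothetical numbers.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long sector = 1003;       /* hypothetical start of the failed write */
        int sect_to_write = 17;                 /* hypothetical length in sectors */
        int block_sectors = 8;                  /* 1 << badblocks.shift, e.g. shift == 3 */

        /* First chunk ends at the next block boundary so all later chunks are aligned. */
        int sectors = ((sector + block_sectors) & ~(unsigned long long)(block_sectors - 1))
                      - sector;

        while (sect_to_write) {
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                printf("write %d sectors at %llu\n", sectors, sector);
                sect_to_write -= sectors;
                sector += sectors;
                sectors = block_sectors;        /* every further chunk is one full block */
        }
        return 0;
}

With these numbers the writes come out as 5 sectors at 1003, then 8 at 1008 and 4 at 1016.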
2043static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
2044{
2045 int slot = r10_bio->read_slot;
2046 int mirror = r10_bio->devs[slot].devnum;
2047 struct bio *bio;
2048 conf_t *conf = mddev->private;
2049 mdk_rdev_t *rdev;
2050 char b[BDEVNAME_SIZE];
2051 unsigned long do_sync;
2052 int max_sectors;
2053
2054 /* we got a read error. Maybe the drive is bad. Maybe just
2055 * the block and we can fix it.
2056 * We freeze all other IO, and try reading the block from
2057 * other devices. When we find one, we re-write
                                                     2058	 * it and check that this fixes the read error.
2059 * This is all done synchronously while the array is
2060 * frozen.
2061 */
2062 if (mddev->ro == 0) {
2063 freeze_array(conf);
2064 fix_read_error(conf, mddev, r10_bio);
2065 unfreeze_array(conf);
2066 }
2067 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
2068
2069 bio = r10_bio->devs[slot].bio;
2070 bdevname(bio->bi_bdev, b);
2071 r10_bio->devs[slot].bio =
2072 mddev->ro ? IO_BLOCKED : NULL;
2073read_more:
2074 mirror = read_balance(conf, r10_bio, &max_sectors);
2075 if (mirror == -1) {
2076 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2077 " read error for block %llu\n",
2078 mdname(mddev), b,
2079 (unsigned long long)r10_bio->sector);
2080 raid_end_bio_io(r10_bio);
2081 bio_put(bio);
2082 return;
2083 }
2084
2085 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2086 if (bio)
2087 bio_put(bio);
2088 slot = r10_bio->read_slot;
2089 rdev = conf->mirrors[mirror].rdev;
2090 printk_ratelimited(
2091 KERN_ERR
                                                     2092			   "md/raid10:%s: %s: redirecting "
2093 "sector %llu to another mirror\n",
2094 mdname(mddev),
2095 bdevname(rdev->bdev, b),
2096 (unsigned long long)r10_bio->sector);
2097 bio = bio_clone_mddev(r10_bio->master_bio,
2098 GFP_NOIO, mddev);
2099 md_trim_bio(bio,
2100 r10_bio->sector - bio->bi_sector,
2101 max_sectors);
2102 r10_bio->devs[slot].bio = bio;
2103 bio->bi_sector = r10_bio->devs[slot].addr
2104 + rdev->data_offset;
2105 bio->bi_bdev = rdev->bdev;
2106 bio->bi_rw = READ | do_sync;
2107 bio->bi_private = r10_bio;
2108 bio->bi_end_io = raid10_end_read_request;
2109 if (max_sectors < r10_bio->sectors) {
2110 /* Drat - have to split this up more */
2111 struct bio *mbio = r10_bio->master_bio;
2112 int sectors_handled =
2113 r10_bio->sector + max_sectors
2114 - mbio->bi_sector;
2115 r10_bio->sectors = max_sectors;
2116 spin_lock_irq(&conf->device_lock);
2117 if (mbio->bi_phys_segments == 0)
2118 mbio->bi_phys_segments = 2;
2119 else
2120 mbio->bi_phys_segments++;
2121 spin_unlock_irq(&conf->device_lock);
2122 generic_make_request(bio);
2123 bio = NULL;
2124
2125 r10_bio = mempool_alloc(conf->r10bio_pool,
2126 GFP_NOIO);
2127 r10_bio->master_bio = mbio;
2128 r10_bio->sectors = (mbio->bi_size >> 9)
2129 - sectors_handled;
2130 r10_bio->state = 0;
2131 set_bit(R10BIO_ReadError,
2132 &r10_bio->state);
2133 r10_bio->mddev = mddev;
2134 r10_bio->sector = mbio->bi_sector
2135 + sectors_handled;
2136
2137 goto read_more;
2138 } else
2139 generic_make_request(bio);
2140}
2141
2142static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
2143{
2144 /* Some sort of write request has finished and it
2145 * succeeded in writing where we thought there was a
2146 * bad block. So forget the bad block.
                                                     2147	 * Or possibly it failed and we need to record
2148 * a bad block.
2149 */
2150 int m;
2151 mdk_rdev_t *rdev;
2152
2153 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2154 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2155 for (m = 0; m < conf->copies; m++) {
2156 int dev = r10_bio->devs[m].devnum;
2157 rdev = conf->mirrors[dev].rdev;
2158 if (r10_bio->devs[m].bio == NULL)
2159 continue;
2160 if (test_bit(BIO_UPTODATE,
2161 &r10_bio->devs[m].bio->bi_flags)) {
2162 rdev_clear_badblocks(
2163 rdev,
2164 r10_bio->devs[m].addr,
2165 r10_bio->sectors);
2166 } else {
2167 if (!rdev_set_badblocks(
2168 rdev,
2169 r10_bio->devs[m].addr,
2170 r10_bio->sectors, 0))
2171 md_error(conf->mddev, rdev);
2172 }
2173 }
2174 put_buf(r10_bio);
2175 } else {
2176 for (m = 0; m < conf->copies; m++) {
2177 int dev = r10_bio->devs[m].devnum;
2178 struct bio *bio = r10_bio->devs[m].bio;
2179 rdev = conf->mirrors[dev].rdev;
2180 if (bio == IO_MADE_GOOD) {
2181 rdev_clear_badblocks(
2182 rdev,
2183 r10_bio->devs[m].addr,
2184 r10_bio->sectors);
2185 rdev_dec_pending(rdev, conf->mddev);
2186 } else if (bio != NULL &&
2187 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2188 if (!narrow_write_error(r10_bio, m)) {
2189 md_error(conf->mddev, rdev);
2190 set_bit(R10BIO_Degraded,
2191 &r10_bio->state);
2192 }
2193 rdev_dec_pending(rdev, conf->mddev);
2194 }
2195 }
2196 if (test_bit(R10BIO_WriteError,
2197 &r10_bio->state))
2198 close_write(r10_bio);
2199 raid_end_bio_io(r10_bio);
2200 }
2201}
2202
1601static void raid10d(mddev_t *mddev) 2203static void raid10d(mddev_t *mddev)
1602{ 2204{
1603 r10bio_t *r10_bio; 2205 r10bio_t *r10_bio;
1604 struct bio *bio;
1605 unsigned long flags; 2206 unsigned long flags;
1606 conf_t *conf = mddev->private; 2207 conf_t *conf = mddev->private;
1607 struct list_head *head = &conf->retry_list; 2208 struct list_head *head = &conf->retry_list;
1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug; 2209 struct blk_plug plug;
1610 2210
1611 md_check_recovery(mddev); 2211 md_check_recovery(mddev);
1612 2212
1613 blk_start_plug(&plug); 2213 blk_start_plug(&plug);
1614 for (;;) { 2214 for (;;) {
1615 char b[BDEVNAME_SIZE];
1616 2215
1617 flush_pending_writes(conf); 2216 flush_pending_writes(conf);
1618 2217
@@ -1628,64 +2227,26 @@ static void raid10d(mddev_t *mddev)
1628 2227
1629 mddev = r10_bio->mddev; 2228 mddev = r10_bio->mddev;
1630 conf = mddev->private; 2229 conf = mddev->private;
1631 if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2230 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2231 test_bit(R10BIO_WriteError, &r10_bio->state))
2232 handle_write_completed(conf, r10_bio);
2233 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
1632 sync_request_write(mddev, r10_bio); 2234 sync_request_write(mddev, r10_bio);
1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2235 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1634 recovery_request_write(mddev, r10_bio); 2236 recovery_request_write(mddev, r10_bio);
2237 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2238 handle_read_error(mddev, r10_bio);
1635 else { 2239 else {
1636 int slot = r10_bio->read_slot; 2240 /* just a partial read to be scheduled from a
1637 int mirror = r10_bio->devs[slot].devnum; 2241 * separate context
1638 /* we got a read error. Maybe the drive is bad. Maybe just
1639 * the block and we can fix it.
1640 * We freeze all other IO, and try reading the block from
1641 * other devices. When we find one, we re-write
1642 * and check it that fixes the read error.
1643 * This is all done synchronously while the array is
1644 * frozen.
1645 */ 2242 */
1646 if (mddev->ro == 0) { 2243 int slot = r10_bio->read_slot;
1647 freeze_array(conf); 2244 generic_make_request(r10_bio->devs[slot].bio);
1648 fix_read_error(conf, mddev, r10_bio);
1649 unfreeze_array(conf);
1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1652
1653 bio = r10_bio->devs[slot].bio;
1654 r10_bio->devs[slot].bio =
1655 mddev->ro ? IO_BLOCKED : NULL;
1656 mirror = read_balance(conf, r10_bio);
1657 if (mirror == -1) {
1658 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1659 " read error for block %llu\n",
1660 mdname(mddev),
1661 bdevname(bio->bi_bdev,b),
1662 (unsigned long long)r10_bio->sector);
1663 raid_end_bio_io(r10_bio);
1664 bio_put(bio);
1665 } else {
1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1669 rdev = conf->mirrors[mirror].rdev;
1670 if (printk_ratelimit())
1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1672 " another mirror\n",
1673 mdname(mddev),
1674 bdevname(rdev->bdev,b),
1675 (unsigned long long)r10_bio->sector);
1676 bio = bio_clone_mddev(r10_bio->master_bio,
1677 GFP_NOIO, mddev);
1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1680 + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1682 bio->bi_rw = READ | do_sync;
1683 bio->bi_private = r10_bio;
1684 bio->bi_end_io = raid10_end_read_request;
1685 generic_make_request(bio);
1686 }
1687 } 2245 }
2246
1688 cond_resched(); 2247 cond_resched();
2248 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2249 md_check_recovery(mddev);
1689 } 2250 }
1690 blk_finish_plug(&plug); 2251 blk_finish_plug(&plug);
1691} 2252}
@@ -1746,7 +2307,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1746 int i; 2307 int i;
1747 int max_sync; 2308 int max_sync;
1748 sector_t sync_blocks; 2309 sector_t sync_blocks;
1749
1750 sector_t sectors_skipped = 0; 2310 sector_t sectors_skipped = 0;
1751 int chunks_skipped = 0; 2311 int chunks_skipped = 0;
1752 2312
@@ -1828,7 +2388,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1828 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2388 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1829 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2389 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1830 /* recovery... the complicated one */ 2390 /* recovery... the complicated one */
1831 int j, k; 2391 int j;
1832 r10_bio = NULL; 2392 r10_bio = NULL;
1833 2393
1834 for (i=0 ; i<conf->raid_disks; i++) { 2394 for (i=0 ; i<conf->raid_disks; i++) {
@@ -1836,6 +2396,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1836 r10bio_t *rb2; 2396 r10bio_t *rb2;
1837 sector_t sect; 2397 sector_t sect;
1838 int must_sync; 2398 int must_sync;
2399 int any_working;
1839 2400
1840 if (conf->mirrors[i].rdev == NULL || 2401 if (conf->mirrors[i].rdev == NULL ||
1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags)) 2402 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
@@ -1887,19 +2448,42 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect, 2448 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded); 2449 &sync_blocks, still_degraded);
1889 2450
2451 any_working = 0;
1890 for (j=0; j<conf->copies;j++) { 2452 for (j=0; j<conf->copies;j++) {
2453 int k;
1891 int d = r10_bio->devs[j].devnum; 2454 int d = r10_bio->devs[j].devnum;
2455 sector_t from_addr, to_addr;
2456 mdk_rdev_t *rdev;
2457 sector_t sector, first_bad;
2458 int bad_sectors;
1892 if (!conf->mirrors[d].rdev || 2459 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 2460 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue; 2461 continue;
1895 /* This is where we read from */ 2462 /* This is where we read from */
2463 any_working = 1;
2464 rdev = conf->mirrors[d].rdev;
2465 sector = r10_bio->devs[j].addr;
2466
2467 if (is_badblock(rdev, sector, max_sync,
2468 &first_bad, &bad_sectors)) {
2469 if (first_bad > sector)
2470 max_sync = first_bad - sector;
2471 else {
2472 bad_sectors -= (sector
2473 - first_bad);
2474 if (max_sync > bad_sectors)
2475 max_sync = bad_sectors;
2476 continue;
2477 }
2478 }
1896 bio = r10_bio->devs[0].bio; 2479 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist; 2480 bio->bi_next = biolist;
1898 biolist = bio; 2481 biolist = bio;
1899 bio->bi_private = r10_bio; 2482 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read; 2483 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ; 2484 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr + 2485 from_addr = r10_bio->devs[j].addr;
2486 bio->bi_sector = from_addr +
1903 conf->mirrors[d].rdev->data_offset; 2487 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2488 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2489 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
@@ -1916,26 +2500,48 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1916 bio->bi_private = r10_bio; 2500 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write; 2501 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE; 2502 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr + 2503 to_addr = r10_bio->devs[k].addr;
2504 bio->bi_sector = to_addr +
1920 conf->mirrors[i].rdev->data_offset; 2505 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 2506 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922 2507
1923 r10_bio->devs[0].devnum = d; 2508 r10_bio->devs[0].devnum = d;
2509 r10_bio->devs[0].addr = from_addr;
1924 r10_bio->devs[1].devnum = i; 2510 r10_bio->devs[1].devnum = i;
2511 r10_bio->devs[1].addr = to_addr;
1925 2512
1926 break; 2513 break;
1927 } 2514 }
1928 if (j == conf->copies) { 2515 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */ 2516 /* Cannot recover, so abort the recovery or
2517 * record a bad block */
1930 put_buf(r10_bio); 2518 put_buf(r10_bio);
1931 if (rb2) 2519 if (rb2)
1932 atomic_dec(&rb2->remaining); 2520 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2; 2521 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR, 2522 if (any_working) {
1935 &mddev->recovery)) 2523 /* problem is that there are bad blocks
1936 printk(KERN_INFO "md/raid10:%s: insufficient " 2524 * on other device(s)
1937 "working devices for recovery.\n", 2525 */
1938 mdname(mddev)); 2526 int k;
2527 for (k = 0; k < conf->copies; k++)
2528 if (r10_bio->devs[k].devnum == i)
2529 break;
2530 if (!rdev_set_badblocks(
2531 conf->mirrors[i].rdev,
2532 r10_bio->devs[k].addr,
2533 max_sync, 0))
2534 any_working = 0;
2535 }
2536 if (!any_working) {
2537 if (!test_and_set_bit(MD_RECOVERY_INTR,
2538 &mddev->recovery))
2539 printk(KERN_INFO "md/raid10:%s: insufficient "
2540 "working devices for recovery.\n",
2541 mdname(mddev));
2542 conf->mirrors[i].recovery_disabled
2543 = mddev->recovery_disabled;
2544 }
1939 break; 2545 break;
1940 } 2546 }
1941 } 2547 }
@@ -1979,12 +2585,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1979 2585
1980 for (i=0; i<conf->copies; i++) { 2586 for (i=0; i<conf->copies; i++) {
1981 int d = r10_bio->devs[i].devnum; 2587 int d = r10_bio->devs[i].devnum;
2588 sector_t first_bad, sector;
2589 int bad_sectors;
2590
1982 bio = r10_bio->devs[i].bio; 2591 bio = r10_bio->devs[i].bio;
1983 bio->bi_end_io = NULL; 2592 bio->bi_end_io = NULL;
1984 clear_bit(BIO_UPTODATE, &bio->bi_flags); 2593 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1985 if (conf->mirrors[d].rdev == NULL || 2594 if (conf->mirrors[d].rdev == NULL ||
1986 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 2595 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1987 continue; 2596 continue;
2597 sector = r10_bio->devs[i].addr;
2598 if (is_badblock(conf->mirrors[d].rdev,
2599 sector, max_sync,
2600 &first_bad, &bad_sectors)) {
2601 if (first_bad > sector)
2602 max_sync = first_bad - sector;
2603 else {
2604 bad_sectors -= (sector - first_bad);
2605 if (max_sync > bad_sectors)
                                                     2606						max_sync = bad_sectors;
2607 continue;
2608 }
2609 }
1988 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2610 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1989 atomic_inc(&r10_bio->remaining); 2611 atomic_inc(&r10_bio->remaining);
1990 bio->bi_next = biolist; 2612 bio->bi_next = biolist;
@@ -1992,7 +2614,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1992 bio->bi_private = r10_bio; 2614 bio->bi_private = r10_bio;
1993 bio->bi_end_io = end_sync_read; 2615 bio->bi_end_io = end_sync_read;
1994 bio->bi_rw = READ; 2616 bio->bi_rw = READ;
1995 bio->bi_sector = r10_bio->devs[i].addr + 2617 bio->bi_sector = sector +
1996 conf->mirrors[d].rdev->data_offset; 2618 conf->mirrors[d].rdev->data_offset;
1997 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 2619 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1998 count++; 2620 count++;
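Both is_badblock() checks above trim max_sync the same way: stop just short of a bad range that starts later in the window, or, when the bad range already covers the start, cap the window at the remaining bad length and skip that device. A standalone sketch of that branch with hypothetical numbers; not part of the patch:

/* Illustrative sketch only: how max_sync is trimmed against a bad range
 * [first_bad, first_bad + bad_sectors) reported by is_badblock().
 */
#include <stdio.h>

static int trim_max_sync(unsigned long long sector, int max_sync,
                         unsigned long long first_bad, int bad_sectors,
                         int *skip)
{
        *skip = 0;
        if (first_bad > sector) {
                /* Bad range starts later: stop just before it.  is_badblock()
                 * only reports overlapping ranges, so this is always a shrink.
                 */
                max_sync = first_bad - sector;
        } else {
                /* Bad range covers our start: cap the window at its remaining
                 * length; the kernel code then 'continue's past this device.
                 */
                bad_sectors -= (sector - first_bad);
                if (max_sync > bad_sectors)
                        max_sync = bad_sectors;
                *skip = 1;
        }
        return max_sync;
}

int main(void)
{
        int skip, trimmed;

        /* hypothetical: a bad range begins 64 sectors into a 256-sector window */
        trimmed = trim_max_sync(4096, 256, 4160, 32, &skip);
        printf("case 1: max_sync=%d skip=%d\n", trimmed, skip);

        /* hypothetical: the window starts 16 sectors into a 48-sector bad range */
        trimmed = trim_max_sync(4096, 256, 4080, 48, &skip);
        printf("case 2: max_sync=%d skip=%d\n", trimmed, skip);
        return 0;
}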
@@ -2079,7 +2701,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
2079 return sectors_skipped + nr_sectors; 2701 return sectors_skipped + nr_sectors;
2080 giveup: 2702 giveup:
2081 /* There is nowhere to write, so all non-sync 2703 /* There is nowhere to write, so all non-sync
2082	 * drives must be failed, so try the next chunk...	2704	 * drives must be failed or in resync, or all
                                                     2705	 * drives have a bad block, so try the next chunk...
2083 */ 2706 */
2084 if (sector_nr + max_sync < max_sector) 2707 if (sector_nr + max_sync < max_sector)
2085 max_sector = sector_nr + max_sync; 2708 max_sector = sector_nr + max_sync;
@@ -2249,6 +2872,7 @@ static int run(mddev_t *mddev)
2249 (conf->raid_disks / conf->near_copies)); 2872 (conf->raid_disks / conf->near_copies));
2250 2873
2251 list_for_each_entry(rdev, &mddev->disks, same_set) { 2874 list_for_each_entry(rdev, &mddev->disks, same_set) {
2875
2252 disk_idx = rdev->raid_disk; 2876 disk_idx = rdev->raid_disk;
2253 if (disk_idx >= conf->raid_disks 2877 if (disk_idx >= conf->raid_disks
2254 || disk_idx < 0) 2878 || disk_idx < 0)
@@ -2271,7 +2895,7 @@ static int run(mddev_t *mddev)
2271 disk->head_position = 0; 2895 disk->head_position = 0;
2272 } 2896 }
2273 /* need to check that every block has at least one working mirror */ 2897 /* need to check that every block has at least one working mirror */
2274 if (!enough(conf)) { 2898 if (!enough(conf, -1)) {
2275 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2899 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2276 mdname(mddev)); 2900 mdname(mddev));
2277 goto out_free_conf; 2901 goto out_free_conf;
@@ -2331,7 +2955,7 @@ static int run(mddev_t *mddev)
2331 return 0; 2955 return 0;
2332 2956
2333out_free_conf: 2957out_free_conf:
2334 md_unregister_thread(mddev->thread); 2958 md_unregister_thread(&mddev->thread);
2335 if (conf->r10bio_pool) 2959 if (conf->r10bio_pool)
2336 mempool_destroy(conf->r10bio_pool); 2960 mempool_destroy(conf->r10bio_pool);
2337 safe_put_page(conf->tmppage); 2961 safe_put_page(conf->tmppage);
@@ -2349,8 +2973,7 @@ static int stop(mddev_t *mddev)
2349 raise_barrier(conf, 0); 2973 raise_barrier(conf, 0);
2350 lower_barrier(conf); 2974 lower_barrier(conf);
2351 2975
2352 md_unregister_thread(mddev->thread); 2976 md_unregister_thread(&mddev->thread);
2353 mddev->thread = NULL;
2354 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2977 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2355 if (conf->r10bio_pool) 2978 if (conf->r10bio_pool)
2356 mempool_destroy(conf->r10bio_pool); 2979 mempool_destroy(conf->r10bio_pool);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 944b1104d3b..79cb52a0d4a 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
6struct mirror_info { 6struct mirror_info {
7 mdk_rdev_t *rdev; 7 mdk_rdev_t *rdev;
8 sector_t head_position; 8 sector_t head_position;
9 int recovery_disabled; /* matches
10 * mddev->recovery_disabled
11 * when we shouldn't try
12 * recovering this device.
13 */
9}; 14};
10 15
11typedef struct r10bio_s r10bio_t; 16typedef struct r10bio_s r10bio_t;
@@ -113,10 +118,26 @@ struct r10bio_s {
113 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 118 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
114 */ 119 */
115#define IO_BLOCKED ((struct bio*)1) 120#define IO_BLOCKED ((struct bio*)1)
121/* When we successfully write to a known bad-block, we need to remove the
122 * bad-block marking which must be done from process context. So we record
123 * the success by setting devs[n].bio to IO_MADE_GOOD
124 */
125#define IO_MADE_GOOD ((struct bio *)2)
126
127#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
116 128
117/* bits for r10bio.state */ 129/* bits for r10bio.state */
118#define R10BIO_Uptodate 0 130#define R10BIO_Uptodate 0
119#define R10BIO_IsSync 1 131#define R10BIO_IsSync 1
120#define R10BIO_IsRecover 2 132#define R10BIO_IsRecover 2
121#define R10BIO_Degraded 3 133#define R10BIO_Degraded 3
134/* Set ReadError on bios that experience a read error
135 * so that raid10d knows what to do with them.
136 */
137#define R10BIO_ReadError 4
138/* If a write for this request means we can clear some
139 * known-bad-block records, we set this flag.
140 */
141#define R10BIO_MadeGood 5
142#define R10BIO_WriteError 6
122#endif 143#endif
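The new IO_MADE_GOOD marker works the same way as IO_BLOCKED: the integers 1 and 2, cast to a pointer, can never be the address of a real bio, so BIO_SPECIAL() lets handle_write_completed() and friends tell markers (and NULL) apart from genuine bios. A self-contained user-space sketch of the idiom; the struct below is a stand-in, not the kernel's:

/* Illustrative sketch only: sentinel pointer values versus real pointers. */
#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

#define IO_BLOCKED   ((struct bio *)1)
#define IO_MADE_GOOD ((struct bio *)2)
#define BIO_SPECIAL(bio) ((unsigned long)(bio) <= 2)

static void describe(struct bio *slot)
{
        if (slot == NULL)
                printf("slot empty\n");
        else if (slot == IO_BLOCKED)
                printf("reads from this device are blocked\n");
        else if (slot == IO_MADE_GOOD)
                printf("write over a known bad block succeeded\n");
        else if (!BIO_SPECIAL(slot))
                printf("real bio at %p - safe to put/free\n", (void *)slot);
}

int main(void)
{
        struct bio *real = malloc(sizeof(*real));

        describe(NULL);
        describe(IO_BLOCKED);
        describe(IO_MADE_GOOD);
        describe(real);
        free(real);
        return 0;
}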
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b72edf35ec5..b6200c3935c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -51,6 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h> 52#include <linux/cpu.h>
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/ratelimit.h>
54#include "md.h" 55#include "md.h"
55#include "raid5.h" 56#include "raid5.h"
56#include "raid0.h" 57#include "raid0.h"
@@ -96,8 +97,6 @@
96#define __inline__ 97#define __inline__
97#endif 98#endif
98 99
99#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
100
101/* 100/*
102 * We maintain a biased count of active stripes in the bottom 16 bits of 101 * We maintain a biased count of active stripes in the bottom 16 bits of
103 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -341,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
341 (unsigned long long)sh->sector, i, dev->toread, 340 (unsigned long long)sh->sector, i, dev->toread,
342 dev->read, dev->towrite, dev->written, 341 dev->read, dev->towrite, dev->written,
343 test_bit(R5_LOCKED, &dev->flags)); 342 test_bit(R5_LOCKED, &dev->flags));
344 BUG(); 343 WARN_ON(1);
345 } 344 }
346 dev->flags = 0; 345 dev->flags = 0;
347 raid5_build_block(sh, i, previous); 346 raid5_build_block(sh, i, previous);
@@ -527,6 +526,36 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
527 atomic_inc(&rdev->nr_pending); 526 atomic_inc(&rdev->nr_pending);
528 rcu_read_unlock(); 527 rcu_read_unlock();
529 528
529 /* We have already checked bad blocks for reads. Now
530 * need to check for writes.
531 */
532 while ((rw & WRITE) && rdev &&
533 test_bit(WriteErrorSeen, &rdev->flags)) {
534 sector_t first_bad;
535 int bad_sectors;
536 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
537 &first_bad, &bad_sectors);
538 if (!bad)
539 break;
540
541 if (bad < 0) {
542 set_bit(BlockedBadBlocks, &rdev->flags);
543 if (!conf->mddev->external &&
544 conf->mddev->flags) {
545 /* It is very unlikely, but we might
546 * still need to write out the
547 * bad block log - better give it
                                                      548			 * a chance */
549 md_check_recovery(conf->mddev);
550 }
551 md_wait_for_blocked_rdev(rdev, conf->mddev);
552 } else {
553 /* Acknowledged bad block - skip the write */
554 rdev_dec_pending(rdev, conf->mddev);
555 rdev = NULL;
556 }
557 }
558
530 if (rdev) { 559 if (rdev) {
531 if (s->syncing || s->expanding || s->expanded) 560 if (s->syncing || s->expanding || s->expanded)
532 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
@@ -548,10 +577,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
548 bi->bi_io_vec[0].bv_offset = 0; 577 bi->bi_io_vec[0].bv_offset = 0;
549 bi->bi_size = STRIPE_SIZE; 578 bi->bi_size = STRIPE_SIZE;
550 bi->bi_next = NULL; 579 bi->bi_next = NULL;
551 if ((rw & WRITE) &&
552 test_bit(R5_ReWrite, &sh->dev[i].flags))
553 atomic_add(STRIPE_SECTORS,
554 &rdev->corrected_errors);
555 generic_make_request(bi); 580 generic_make_request(bi);
556 } else { 581 } else {
557 if (rw & WRITE) 582 if (rw & WRITE)
@@ -1020,12 +1045,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1020 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1021 struct bio *wbi; 1046 struct bio *wbi;
1022 1047
1023 spin_lock(&sh->lock); 1048 spin_lock_irq(&sh->raid_conf->device_lock);
1024 chosen = dev->towrite; 1049 chosen = dev->towrite;
1025 dev->towrite = NULL; 1050 dev->towrite = NULL;
1026 BUG_ON(dev->written); 1051 BUG_ON(dev->written);
1027 wbi = dev->written = chosen; 1052 wbi = dev->written = chosen;
1028 spin_unlock(&sh->lock); 1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1029 1054
1030 while (wbi && wbi->bi_sector < 1055 while (wbi && wbi->bi_sector <
1031 dev->sector + STRIPE_SECTORS) { 1056 dev->sector + STRIPE_SECTORS) {
@@ -1315,12 +1340,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1315static int grow_one_stripe(raid5_conf_t *conf) 1340static int grow_one_stripe(raid5_conf_t *conf)
1316{ 1341{
1317 struct stripe_head *sh; 1342 struct stripe_head *sh;
1318 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1319 if (!sh) 1344 if (!sh)
1320 return 0; 1345 return 0;
1321 memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); 1346
1322 sh->raid_conf = conf; 1347 sh->raid_conf = conf;
1323 spin_lock_init(&sh->lock);
1324 #ifdef CONFIG_MULTICORE_RAID456 1348 #ifdef CONFIG_MULTICORE_RAID456
1325 init_waitqueue_head(&sh->ops.wait_for_ops); 1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1326 #endif 1350 #endif
@@ -1435,14 +1459,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1435 return -ENOMEM; 1459 return -ENOMEM;
1436 1460
1437 for (i = conf->max_nr_stripes; i; i--) { 1461 for (i = conf->max_nr_stripes; i; i--) {
1438 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1462 nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1439 if (!nsh) 1463 if (!nsh)
1440 break; 1464 break;
1441 1465
1442 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1443
1444 nsh->raid_conf = conf; 1466 nsh->raid_conf = conf;
1445 spin_lock_init(&nsh->lock);
1446 #ifdef CONFIG_MULTICORE_RAID456 1467 #ifdef CONFIG_MULTICORE_RAID456
1447 init_waitqueue_head(&nsh->ops.wait_for_ops); 1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1448 #endif 1469 #endif
@@ -1587,12 +1608,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
1587 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1588 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1589 rdev = conf->disks[i].rdev; 1610 rdev = conf->disks[i].rdev;
1590 printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1611 printk_ratelimited(
1591 " (%lu sectors at %llu on %s)\n", 1612 KERN_INFO
1592 mdname(conf->mddev), STRIPE_SECTORS, 1613 "md/raid:%s: read error corrected"
1593 (unsigned long long)(sh->sector 1614 " (%lu sectors at %llu on %s)\n",
1594 + rdev->data_offset), 1615 mdname(conf->mddev), STRIPE_SECTORS,
1595 bdevname(rdev->bdev, b)); 1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1618 bdevname(rdev->bdev, b));
1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1596 clear_bit(R5_ReadError, &sh->dev[i].flags); 1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1597 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1598 } 1622 }
@@ -1606,22 +1630,24 @@ static void raid5_end_read_request(struct bio * bi, int error)
1606 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1607 atomic_inc(&rdev->read_errors); 1631 atomic_inc(&rdev->read_errors);
1608 if (conf->mddev->degraded >= conf->max_degraded) 1632 if (conf->mddev->degraded >= conf->max_degraded)
1609 printk_rl(KERN_WARNING 1633 printk_ratelimited(
1610 "md/raid:%s: read error not correctable " 1634 KERN_WARNING
1611 "(sector %llu on %s).\n", 1635 "md/raid:%s: read error not correctable "
1612 mdname(conf->mddev), 1636 "(sector %llu on %s).\n",
1613 (unsigned long long)(sh->sector 1637 mdname(conf->mddev),
1614 + rdev->data_offset), 1638 (unsigned long long)(sh->sector
1615 bdn); 1639 + rdev->data_offset),
1640 bdn);
1616 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1617 /* Oh, no!!! */ 1642 /* Oh, no!!! */
1618 printk_rl(KERN_WARNING 1643 printk_ratelimited(
1619 "md/raid:%s: read error NOT corrected!! " 1644 KERN_WARNING
1620 "(sector %llu on %s).\n", 1645 "md/raid:%s: read error NOT corrected!! "
1621 mdname(conf->mddev), 1646 "(sector %llu on %s).\n",
1622 (unsigned long long)(sh->sector 1647 mdname(conf->mddev),
1623 + rdev->data_offset), 1648 (unsigned long long)(sh->sector
1624 bdn); 1649 + rdev->data_offset),
1650 bdn);
1625 else if (atomic_read(&rdev->read_errors) 1651 else if (atomic_read(&rdev->read_errors)
1626 > conf->max_nr_stripes) 1652 > conf->max_nr_stripes)
1627 printk(KERN_WARNING 1653 printk(KERN_WARNING
@@ -1649,6 +1675,8 @@ static void raid5_end_write_request(struct bio *bi, int error)
1649 raid5_conf_t *conf = sh->raid_conf; 1675 raid5_conf_t *conf = sh->raid_conf;
1650 int disks = sh->disks, i; 1676 int disks = sh->disks, i;
1651 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1678 sector_t first_bad;
1679 int bad_sectors;
1652 1680
1653 for (i=0 ; i<disks; i++) 1681 for (i=0 ; i<disks; i++)
1654 if (bi == &sh->dev[i].req) 1682 if (bi == &sh->dev[i].req)
@@ -1662,8 +1690,12 @@ static void raid5_end_write_request(struct bio *bi, int error)
1662 return; 1690 return;
1663 } 1691 }
1664 1692
1665 if (!uptodate) 1693 if (!uptodate) {
1666 md_error(conf->mddev, conf->disks[i].rdev); 1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1697 &first_bad, &bad_sectors))
1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1667 1699
1668 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1669 1701
@@ -1710,6 +1742,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1710 */ 1742 */
1711 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1712 } 1744 }
1745 set_bit(Blocked, &rdev->flags);
1713 set_bit(Faulty, &rdev->flags); 1746 set_bit(Faulty, &rdev->flags);
1714 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1715 printk(KERN_ALERT 1748 printk(KERN_ALERT
@@ -1760,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1760 /* 1793 /*
1761 * Select the parity disk based on the user selected algorithm. 1794 * Select the parity disk based on the user selected algorithm.
1762 */ 1795 */
1763 pd_idx = qd_idx = ~0; 1796 pd_idx = qd_idx = -1;
1764 switch(conf->level) { 1797 switch(conf->level) {
1765 case 4: 1798 case 4:
1766 pd_idx = data_disks; 1799 pd_idx = data_disks;
@@ -2143,12 +2176,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2143 raid5_conf_t *conf = sh->raid_conf; 2176 raid5_conf_t *conf = sh->raid_conf;
2144 int firstwrite=0; 2177 int firstwrite=0;
2145 2178
2146 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2147 (unsigned long long)bi->bi_sector, 2180 (unsigned long long)bi->bi_sector,
2148 (unsigned long long)sh->sector); 2181 (unsigned long long)sh->sector);
2149 2182
2150 2183
2151 spin_lock(&sh->lock);
2152 spin_lock_irq(&conf->device_lock); 2184 spin_lock_irq(&conf->device_lock);
2153 if (forwrite) { 2185 if (forwrite) {
2154 bip = &sh->dev[dd_idx].towrite; 2186 bip = &sh->dev[dd_idx].towrite;
@@ -2169,19 +2201,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2169 bi->bi_next = *bip; 2201 bi->bi_next = *bip;
2170 *bip = bi; 2202 *bip = bi;
2171 bi->bi_phys_segments++; 2203 bi->bi_phys_segments++;
2172 spin_unlock_irq(&conf->device_lock);
2173 spin_unlock(&sh->lock);
2174
2175 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2176 (unsigned long long)bi->bi_sector,
2177 (unsigned long long)sh->sector, dd_idx);
2178
2179 if (conf->mddev->bitmap && firstwrite) {
2180 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2181 STRIPE_SECTORS, 0);
2182 sh->bm_seq = conf->seq_flush+1;
2183 set_bit(STRIPE_BIT_DELAY, &sh->state);
2184 }
2185 2204
2186 if (forwrite) { 2205 if (forwrite) {
2187 /* check if page is covered */ 2206 /* check if page is covered */
@@ -2196,12 +2215,23 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2196 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2197 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2198 } 2217 }
2218 spin_unlock_irq(&conf->device_lock);
2219
2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2221 (unsigned long long)(*bip)->bi_sector,
2222 (unsigned long long)sh->sector, dd_idx);
2223
2224 if (conf->mddev->bitmap && firstwrite) {
2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2226 STRIPE_SECTORS, 0);
2227 sh->bm_seq = conf->seq_flush+1;
2228 set_bit(STRIPE_BIT_DELAY, &sh->state);
2229 }
2199 return 1; 2230 return 1;
2200 2231
2201 overlap: 2232 overlap:
2202 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2203 spin_unlock_irq(&conf->device_lock); 2234 spin_unlock_irq(&conf->device_lock);
2204 spin_unlock(&sh->lock);
2205 return 0; 2235 return 0;
2206} 2236}
2207 2237
@@ -2238,9 +2268,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2238 rcu_read_lock(); 2268 rcu_read_lock();
2239 rdev = rcu_dereference(conf->disks[i].rdev); 2269 rdev = rcu_dereference(conf->disks[i].rdev);
2240 if (rdev && test_bit(In_sync, &rdev->flags)) 2270 if (rdev && test_bit(In_sync, &rdev->flags))
2241 /* multiple read failures in one stripe */ 2271 atomic_inc(&rdev->nr_pending);
2242 md_error(conf->mddev, rdev); 2272 else
2273 rdev = NULL;
2243 rcu_read_unlock(); 2274 rcu_read_unlock();
2275 if (rdev) {
2276 if (!rdev_set_badblocks(
2277 rdev,
2278 sh->sector,
2279 STRIPE_SECTORS, 0))
2280 md_error(conf->mddev, rdev);
2281 rdev_dec_pending(rdev, conf->mddev);
2282 }
2244 } 2283 }
2245 spin_lock_irq(&conf->device_lock); 2284 spin_lock_irq(&conf->device_lock);
2246 /* fail all writes first */ 2285 /* fail all writes first */
@@ -2308,6 +2347,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2308 if (bitmap_end) 2347 if (bitmap_end)
2309 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2310 STRIPE_SECTORS, 0, 0); 2349 STRIPE_SECTORS, 0, 0);
2350 /* If we were in the middle of a write the parity block might
2351 * still be locked - so just clear all R5_LOCKED flags
2352 */
2353 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2311 } 2354 }
2312 2355
2313 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2356 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2315,109 +2358,73 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2315 md_wakeup_thread(conf->mddev->thread); 2358 md_wakeup_thread(conf->mddev->thread);
2316} 2359}
2317 2360
2318/* fetch_block5 - checks the given member device to see if its data needs 2361static void
2319 * to be read or computed to satisfy a request. 2362handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2320 * 2363 struct stripe_head_state *s)
2321 * Returns 1 when no more member devices need to be checked, otherwise returns
2322 * 0 to tell the loop in handle_stripe_fill5 to continue
2323 */
2324static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2325 int disk_idx, int disks)
2326{
2327 struct r5dev *dev = &sh->dev[disk_idx];
2328 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2329
2330 /* is the data in this block needed, and can we get it? */
2331 if (!test_bit(R5_LOCKED, &dev->flags) &&
2332 !test_bit(R5_UPTODATE, &dev->flags) &&
2333 (dev->toread ||
2334 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2335 s->syncing || s->expanding ||
2336 (s->failed &&
2337 (failed_dev->toread ||
2338 (failed_dev->towrite &&
2339 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2340 /* We would like to get this block, possibly by computing it,
2341 * otherwise read it if the backing disk is insync
2342 */
2343 if ((s->uptodate == disks - 1) &&
2344 (s->failed && disk_idx == s->failed_num)) {
2345 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2346 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2347 set_bit(R5_Wantcompute, &dev->flags);
2348 sh->ops.target = disk_idx;
2349 sh->ops.target2 = -1;
2350 s->req_compute = 1;
2351 /* Careful: from this point on 'uptodate' is in the eye
2352 * of raid_run_ops which services 'compute' operations
2353 * before writes. R5_Wantcompute flags a block that will
2354 * be R5_UPTODATE by the time it is needed for a
2355 * subsequent operation.
2356 */
2357 s->uptodate++;
2358 return 1; /* uptodate + compute == disks */
2359 } else if (test_bit(R5_Insync, &dev->flags)) {
2360 set_bit(R5_LOCKED, &dev->flags);
2361 set_bit(R5_Wantread, &dev->flags);
2362 s->locked++;
2363 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2364 s->syncing);
2365 }
2366 }
2367
2368 return 0;
2369}
2370
2371/**
2372 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2373 */
2374static void handle_stripe_fill5(struct stripe_head *sh,
2375 struct stripe_head_state *s, int disks)
2376{ 2364{
2365 int abort = 0;
2377 int i; 2366 int i;
2378 2367
2379 /* look for blocks to read/compute, skip this if a compute 2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2380 * is already in flight, or if the stripe contents are in the 2369 clear_bit(STRIPE_SYNCING, &sh->state);
2381 * midst of changing due to a write 2370 s->syncing = 0;
2371 /* There is nothing more to do for sync/check/repair.
2372 * For recover we need to record a bad block on all
2373 * non-sync devices, or abort the recovery
2382 */ 2374 */
2383 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2384 !sh->reconstruct_state) 2376 return;
2385 for (i = disks; i--; ) 2377 /* During recovery devices cannot be removed, so locking and
2386 if (fetch_block5(sh, s, i, disks)) 2378 * refcounting of rdevs is not needed
2387 break; 2379 */
2388 set_bit(STRIPE_HANDLE, &sh->state); 2380 for (i = 0; i < conf->raid_disks; i++) {
2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2382 if (!rdev
2383 || test_bit(Faulty, &rdev->flags)
2384 || test_bit(In_sync, &rdev->flags))
2385 continue;
2386 if (!rdev_set_badblocks(rdev, sh->sector,
2387 STRIPE_SECTORS, 0))
2388 abort = 1;
2389 }
2390 if (abort) {
2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2393 }
2389} 2394}
2390 2395
2391/* fetch_block6 - checks the given member device to see if its data needs 2396/* fetch_block - checks the given member device to see if its data needs
2392 * to be read or computed to satisfy a request. 2397 * to be read or computed to satisfy a request.
2393 * 2398 *
2394 * Returns 1 when no more member devices need to be checked, otherwise returns 2399 * Returns 1 when no more member devices need to be checked, otherwise returns
2395 * 0 to tell the loop in handle_stripe_fill6 to continue 2400 * 0 to tell the loop in handle_stripe_fill to continue
2396 */ 2401 */
2397static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2402static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2398 struct r6_state *r6s, int disk_idx, int disks) 2403 int disk_idx, int disks)
2399{ 2404{
2400 struct r5dev *dev = &sh->dev[disk_idx]; 2405 struct r5dev *dev = &sh->dev[disk_idx];
2401 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2406 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2402 &sh->dev[r6s->failed_num[1]] }; 2407 &sh->dev[s->failed_num[1]] };
2403 2408
2409 /* is the data in this block needed, and can we get it? */
2404 if (!test_bit(R5_LOCKED, &dev->flags) && 2410 if (!test_bit(R5_LOCKED, &dev->flags) &&
2405 !test_bit(R5_UPTODATE, &dev->flags) && 2411 !test_bit(R5_UPTODATE, &dev->flags) &&
2406 (dev->toread || 2412 (dev->toread ||
2407 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2408 s->syncing || s->expanding || 2414 s->syncing || s->expanding ||
2409 (s->failed >= 1 && 2415 (s->failed >= 1 && fdev[0]->toread) ||
2410 (fdev[0]->toread || s->to_write)) || 2416 (s->failed >= 2 && fdev[1]->toread) ||
2411 (s->failed >= 2 && 2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2412 (fdev[1]->toread || s->to_write)))) { 2418 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2419 (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2413 /* we would like to get this block, possibly by computing it, 2420 /* we would like to get this block, possibly by computing it,
2414 * otherwise read it if the backing disk is insync 2421 * otherwise read it if the backing disk is insync
2415 */ 2422 */
2416 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2423 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2417 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2424 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2418 if ((s->uptodate == disks - 1) && 2425 if ((s->uptodate == disks - 1) &&
2419 (s->failed && (disk_idx == r6s->failed_num[0] || 2426 (s->failed && (disk_idx == s->failed_num[0] ||
2420 disk_idx == r6s->failed_num[1]))) { 2427 disk_idx == s->failed_num[1]))) {
2421 /* have disk failed, and we're requested to fetch it; 2428 /* have disk failed, and we're requested to fetch it;
2422 * do compute it 2429 * do compute it
2423 */ 2430 */
@@ -2429,6 +2436,12 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2429 sh->ops.target = disk_idx; 2436 sh->ops.target = disk_idx;
2430 sh->ops.target2 = -1; /* no 2nd target */ 2437 sh->ops.target2 = -1; /* no 2nd target */
2431 s->req_compute = 1; 2438 s->req_compute = 1;
2439 /* Careful: from this point on 'uptodate' is in the eye
2440 * of raid_run_ops which services 'compute' operations
2441 * before writes. R5_Wantcompute flags a block that will
2442 * be R5_UPTODATE by the time it is needed for a
2443 * subsequent operation.
2444 */
2432 s->uptodate++; 2445 s->uptodate++;
2433 return 1; 2446 return 1;
2434 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2447 } else if (s->uptodate == disks-2 && s->failed >= 2) {
@@ -2469,11 +2482,11 @@ static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2469} 2482}
2470 2483
2471/** 2484/**
2472 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2485 * handle_stripe_fill - read or compute data to satisfy pending requests.
2473 */ 2486 */
2474static void handle_stripe_fill6(struct stripe_head *sh, 2487static void handle_stripe_fill(struct stripe_head *sh,
2475 struct stripe_head_state *s, struct r6_state *r6s, 2488 struct stripe_head_state *s,
2476 int disks) 2489 int disks)
2477{ 2490{
2478 int i; 2491 int i;
2479 2492
@@ -2484,7 +2497,7 @@ static void handle_stripe_fill6(struct stripe_head *sh,
2484 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2497 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2485 !sh->reconstruct_state) 2498 !sh->reconstruct_state)
2486 for (i = disks; i--; ) 2499 for (i = disks; i--; )
2487 if (fetch_block6(sh, s, r6s, i, disks)) 2500 if (fetch_block(sh, s, i, disks))
2488 break; 2501 break;
2489 set_bit(STRIPE_HANDLE, &sh->state); 2502 set_bit(STRIPE_HANDLE, &sh->state);
2490} 2503}
@@ -2540,11 +2553,19 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
2540 md_wakeup_thread(conf->mddev->thread); 2553 md_wakeup_thread(conf->mddev->thread);
2541} 2554}
2542 2555
2543static void handle_stripe_dirtying5(raid5_conf_t *conf, 2556static void handle_stripe_dirtying(raid5_conf_t *conf,
2544 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2557 struct stripe_head *sh,
2558 struct stripe_head_state *s,
2559 int disks)
2545{ 2560{
2546 int rmw = 0, rcw = 0, i; 2561 int rmw = 0, rcw = 0, i;
2547 for (i = disks; i--; ) { 2562 if (conf->max_degraded == 2) {
2563 /* RAID6 requires 'rcw' in current implementation
                                                     2564		/* RAID6 requires 'rcw' in current implementation
                                                     2565		 * Calculate the real rcw later - for now pretend
2566 */
2567 rcw = 1; rmw = 2;
2568 } else for (i = disks; i--; ) {
2548 /* would I have to read this buffer for read_modify_write */ 2569 /* would I have to read this buffer for read_modify_write */
2549 struct r5dev *dev = &sh->dev[i]; 2570 struct r5dev *dev = &sh->dev[i];
2550 if ((dev->towrite || i == sh->pd_idx) && 2571 if ((dev->towrite || i == sh->pd_idx) &&
@@ -2591,16 +2612,19 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2591 } 2612 }
2592 } 2613 }
2593 } 2614 }
2594 if (rcw <= rmw && rcw > 0) 2615 if (rcw <= rmw && rcw > 0) {
2595 /* want reconstruct write, but need to get some data */ 2616 /* want reconstruct write, but need to get some data */
2617 rcw = 0;
2596 for (i = disks; i--; ) { 2618 for (i = disks; i--; ) {
2597 struct r5dev *dev = &sh->dev[i]; 2619 struct r5dev *dev = &sh->dev[i];
2598 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2620 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2599 i != sh->pd_idx && 2621 i != sh->pd_idx && i != sh->qd_idx &&
2600 !test_bit(R5_LOCKED, &dev->flags) && 2622 !test_bit(R5_LOCKED, &dev->flags) &&
2601 !(test_bit(R5_UPTODATE, &dev->flags) || 2623 !(test_bit(R5_UPTODATE, &dev->flags) ||
2602 test_bit(R5_Wantcompute, &dev->flags)) && 2624 test_bit(R5_Wantcompute, &dev->flags))) {
2603 test_bit(R5_Insync, &dev->flags)) { 2625 rcw++;
2626 if (!test_bit(R5_Insync, &dev->flags))
2627 continue; /* it's a failed drive */
2604 if ( 2628 if (
2605 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2629 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2606 pr_debug("Read_old block " 2630 pr_debug("Read_old block "
@@ -2614,6 +2638,7 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2614 } 2638 }
2615 } 2639 }
2616 } 2640 }
2641 }
2617 /* now if nothing is locked, and if we have enough data, 2642 /* now if nothing is locked, and if we have enough data,
2618 * we can start a write request 2643 * we can start a write request
2619 */ 2644 */
@@ -2630,53 +2655,6 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf,
2630 schedule_reconstruction(sh, s, rcw == 0, 0); 2655 schedule_reconstruction(sh, s, rcw == 0, 0);
2631} 2656}
2632 2657
2633static void handle_stripe_dirtying6(raid5_conf_t *conf,
2634 struct stripe_head *sh, struct stripe_head_state *s,
2635 struct r6_state *r6s, int disks)
2636{
2637 int rcw = 0, pd_idx = sh->pd_idx, i;
2638 int qd_idx = sh->qd_idx;
2639
2640 set_bit(STRIPE_HANDLE, &sh->state);
2641 for (i = disks; i--; ) {
2642 struct r5dev *dev = &sh->dev[i];
2643 /* check if we haven't enough data */
2644 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2645 i != pd_idx && i != qd_idx &&
2646 !test_bit(R5_LOCKED, &dev->flags) &&
2647 !(test_bit(R5_UPTODATE, &dev->flags) ||
2648 test_bit(R5_Wantcompute, &dev->flags))) {
2649 rcw++;
2650 if (!test_bit(R5_Insync, &dev->flags))
2651 continue; /* it's a failed drive */
2652
2653 if (
2654 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2655 pr_debug("Read_old stripe %llu "
2656 "block %d for Reconstruct\n",
2657 (unsigned long long)sh->sector, i);
2658 set_bit(R5_LOCKED, &dev->flags);
2659 set_bit(R5_Wantread, &dev->flags);
2660 s->locked++;
2661 } else {
2662 pr_debug("Request delayed stripe %llu "
2663 "block %d for Reconstruct\n",
2664 (unsigned long long)sh->sector, i);
2665 set_bit(STRIPE_DELAYED, &sh->state);
2666 set_bit(STRIPE_HANDLE, &sh->state);
2667 }
2668 }
2669 }
2670 /* now if nothing is locked, and if we have enough data, we can start a
2671 * write request
2672 */
2673 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2674 s->locked == 0 && rcw == 0 &&
2675 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2676 schedule_reconstruction(sh, s, 1, 0);
2677 }
2678}
2679
2680static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2658static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2681 struct stripe_head_state *s, int disks) 2659 struct stripe_head_state *s, int disks)
2682{ 2660{
@@ -2695,7 +2673,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2695 s->uptodate--; 2673 s->uptodate--;
2696 break; 2674 break;
2697 } 2675 }
2698 dev = &sh->dev[s->failed_num]; 2676 dev = &sh->dev[s->failed_num[0]];
2699 /* fall through */ 2677 /* fall through */
2700 case check_state_compute_result: 2678 case check_state_compute_result:
2701 sh->check_state = check_state_idle; 2679 sh->check_state = check_state_idle;
@@ -2767,7 +2745,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2767 2745
2768static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2746static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2769 struct stripe_head_state *s, 2747 struct stripe_head_state *s,
2770 struct r6_state *r6s, int disks) 2748 int disks)
2771{ 2749{
2772 int pd_idx = sh->pd_idx; 2750 int pd_idx = sh->pd_idx;
2773 int qd_idx = sh->qd_idx; 2751 int qd_idx = sh->qd_idx;
@@ -2786,14 +2764,14 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2786 switch (sh->check_state) { 2764 switch (sh->check_state) {
2787 case check_state_idle: 2765 case check_state_idle:
2788 /* start a new check operation if there are < 2 failures */ 2766 /* start a new check operation if there are < 2 failures */
2789 if (s->failed == r6s->q_failed) { 2767 if (s->failed == s->q_failed) {
2790 /* The only possible failed device holds Q, so it 2768 /* The only possible failed device holds Q, so it
2791 * makes sense to check P (If anything else were failed, 2769 * makes sense to check P (If anything else were failed,
2792 * we would have used P to recreate it). 2770 * we would have used P to recreate it).
2793 */ 2771 */
2794 sh->check_state = check_state_run; 2772 sh->check_state = check_state_run;
2795 } 2773 }
2796 if (!r6s->q_failed && s->failed < 2) { 2774 if (!s->q_failed && s->failed < 2) {
2797 /* Q is not failed, and we didn't use it to generate 2775 /* Q is not failed, and we didn't use it to generate
2798 * anything, so it makes sense to check it 2776 * anything, so it makes sense to check it
2799 */ 2777 */
@@ -2835,13 +2813,13 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2835 */ 2813 */
2836 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2814 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2837 if (s->failed == 2) { 2815 if (s->failed == 2) {
2838 dev = &sh->dev[r6s->failed_num[1]]; 2816 dev = &sh->dev[s->failed_num[1]];
2839 s->locked++; 2817 s->locked++;
2840 set_bit(R5_LOCKED, &dev->flags); 2818 set_bit(R5_LOCKED, &dev->flags);
2841 set_bit(R5_Wantwrite, &dev->flags); 2819 set_bit(R5_Wantwrite, &dev->flags);
2842 } 2820 }
2843 if (s->failed >= 1) { 2821 if (s->failed >= 1) {
2844 dev = &sh->dev[r6s->failed_num[0]]; 2822 dev = &sh->dev[s->failed_num[0]];
2845 s->locked++; 2823 s->locked++;
2846 set_bit(R5_LOCKED, &dev->flags); 2824 set_bit(R5_LOCKED, &dev->flags);
2847 set_bit(R5_Wantwrite, &dev->flags); 2825 set_bit(R5_Wantwrite, &dev->flags);
@@ -2928,8 +2906,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2928 } 2906 }
2929} 2907}
2930 2908
2931static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2909static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
2932 struct r6_state *r6s)
2933{ 2910{
2934 int i; 2911 int i;
2935 2912
@@ -2971,7 +2948,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2971 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2948 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2972 for (j = 0; j < conf->raid_disks; j++) 2949 for (j = 0; j < conf->raid_disks; j++)
2973 if (j != sh2->pd_idx && 2950 if (j != sh2->pd_idx &&
2974 (!r6s || j != sh2->qd_idx) && 2951 j != sh2->qd_idx &&
2975 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2952 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2976 break; 2953 break;
2977 if (j == conf->raid_disks) { 2954 if (j == conf->raid_disks) {
@@ -3006,43 +2983,35 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
3006 * 2983 *
3007 */ 2984 */
3008 2985
3009static void handle_stripe5(struct stripe_head *sh) 2986static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3010{ 2987{
3011 raid5_conf_t *conf = sh->raid_conf; 2988 raid5_conf_t *conf = sh->raid_conf;
3012 int disks = sh->disks, i; 2989 int disks = sh->disks;
3013 struct bio *return_bi = NULL;
3014 struct stripe_head_state s;
3015 struct r5dev *dev; 2990 struct r5dev *dev;
3016 mdk_rdev_t *blocked_rdev = NULL; 2991 int i;
3017 int prexor;
3018 int dec_preread_active = 0;
3019 2992
3020 memset(&s, 0, sizeof(s)); 2993 memset(s, 0, sizeof(*s));
3021 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
3022 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
3023 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
3024 sh->reconstruct_state);
3025 2994
3026 spin_lock(&sh->lock); 2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
3027 clear_bit(STRIPE_HANDLE, &sh->state); 2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3028 clear_bit(STRIPE_DELAYED, &sh->state); 2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3029 2998 s->failed_num[0] = -1;
3030 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2999 s->failed_num[1] = -1;
3031 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3032 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3033 3000
3034 /* Now to look around and see what can be done */ 3001 /* Now to look around and see what can be done */
3035 rcu_read_lock(); 3002 rcu_read_lock();
3003 spin_lock_irq(&conf->device_lock);
3036 for (i=disks; i--; ) { 3004 for (i=disks; i--; ) {
3037 mdk_rdev_t *rdev; 3005 mdk_rdev_t *rdev;
3006 sector_t first_bad;
3007 int bad_sectors;
3008 int is_bad = 0;
3038 3009
3039 dev = &sh->dev[i]; 3010 dev = &sh->dev[i];
3040 3011
3041 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3042 "written %p\n", i, dev->flags, dev->toread, dev->read, 3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3043 dev->towrite, dev->written); 3014 /* maybe we can reply to a read
3044
3045 /* maybe we can request a biofill operation
3046 * 3015 *
3047 * new wantfill requests are only permitted while 3016 * new wantfill requests are only permitted while
3048 * ops_complete_biofill is guaranteed to be inactive 3017 * ops_complete_biofill is guaranteed to be inactive
@@ -3052,37 +3021,74 @@ static void handle_stripe5(struct stripe_head *sh)
3052 set_bit(R5_Wantfill, &dev->flags); 3021 set_bit(R5_Wantfill, &dev->flags);
3053 3022
3054 /* now count some things */ 3023 /* now count some things */
3055 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3024 if (test_bit(R5_LOCKED, &dev->flags))
3056 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3025 s->locked++;
3057 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 3026 if (test_bit(R5_UPTODATE, &dev->flags))
3027 s->uptodate++;
3028 if (test_bit(R5_Wantcompute, &dev->flags)) {
3029 s->compute++;
3030 BUG_ON(s->compute > 2);
3031 }
3058 3032
3059 if (test_bit(R5_Wantfill, &dev->flags)) 3033 if (test_bit(R5_Wantfill, &dev->flags))
3060 s.to_fill++; 3034 s->to_fill++;
3061 else if (dev->toread) 3035 else if (dev->toread)
3062 s.to_read++; 3036 s->to_read++;
3063 if (dev->towrite) { 3037 if (dev->towrite) {
3064 s.to_write++; 3038 s->to_write++;
3065 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3039 if (!test_bit(R5_OVERWRITE, &dev->flags))
3066 s.non_overwrite++; 3040 s->non_overwrite++;
3067 } 3041 }
3068 if (dev->written) 3042 if (dev->written)
3069 s.written++; 3043 s->written++;
3070 rdev = rcu_dereference(conf->disks[i].rdev); 3044 rdev = rcu_dereference(conf->disks[i].rdev);
3071 if (blocked_rdev == NULL && 3045 if (rdev) {
3072 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3073 blocked_rdev = rdev; 3047 &first_bad, &bad_sectors);
3074 atomic_inc(&rdev->nr_pending); 3048 if (s->blocked_rdev == NULL
3049 && (test_bit(Blocked, &rdev->flags)
3050 || is_bad < 0)) {
3051 if (is_bad < 0)
3052 set_bit(BlockedBadBlocks,
3053 &rdev->flags);
3054 s->blocked_rdev = rdev;
3055 atomic_inc(&rdev->nr_pending);
3056 }
3075 } 3057 }
3076 clear_bit(R5_Insync, &dev->flags); 3058 clear_bit(R5_Insync, &dev->flags);
3077 if (!rdev) 3059 if (!rdev)
3078 /* Not in-sync */; 3060 /* Not in-sync */;
3079 else if (test_bit(In_sync, &rdev->flags)) 3061 else if (is_bad) {
3062 /* also not in-sync */
3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3064 /* treat as in-sync, but with a read error
3065 * which we can now try to correct
3066 */
3067 set_bit(R5_Insync, &dev->flags);
3068 set_bit(R5_ReadError, &dev->flags);
3069 }
3070 } else if (test_bit(In_sync, &rdev->flags))
3080 set_bit(R5_Insync, &dev->flags); 3071 set_bit(R5_Insync, &dev->flags);
3081 else { 3072 else if (!test_bit(Faulty, &rdev->flags)) {
3082 /* could be in-sync depending on recovery/reshape status */ 3073 /* in sync if before recovery_offset */
3083 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3084 set_bit(R5_Insync, &dev->flags); 3075 set_bit(R5_Insync, &dev->flags);
3085 } 3076 }
3077 if (test_bit(R5_WriteError, &dev->flags)) {
3078 clear_bit(R5_Insync, &dev->flags);
3079 if (!test_bit(Faulty, &rdev->flags)) {
3080 s->handle_bad_blocks = 1;
3081 atomic_inc(&rdev->nr_pending);
3082 } else
3083 clear_bit(R5_WriteError, &dev->flags);
3084 }
3085 if (test_bit(R5_MadeGood, &dev->flags)) {
3086 if (!test_bit(Faulty, &rdev->flags)) {
3087 s->handle_bad_blocks = 1;
3088 atomic_inc(&rdev->nr_pending);
3089 } else
3090 clear_bit(R5_MadeGood, &dev->flags);
3091 }
3086 if (!test_bit(R5_Insync, &dev->flags)) { 3092 if (!test_bit(R5_Insync, &dev->flags)) {
3087 /* The ReadError flag will just be confusing now */ 3093 /* The ReadError flag will just be confusing now */
3088 clear_bit(R5_ReadError, &dev->flags); 3094 clear_bit(R5_ReadError, &dev->flags);
@@ -3091,313 +3097,60 @@ static void handle_stripe5(struct stripe_head *sh)
3091 if (test_bit(R5_ReadError, &dev->flags)) 3097 if (test_bit(R5_ReadError, &dev->flags))
3092 clear_bit(R5_Insync, &dev->flags); 3098 clear_bit(R5_Insync, &dev->flags);
3093 if (!test_bit(R5_Insync, &dev->flags)) { 3099 if (!test_bit(R5_Insync, &dev->flags)) {
3094 s.failed++; 3100 if (s->failed < 2)
3095 s.failed_num = i; 3101 s->failed_num[s->failed] = i;
3102 s->failed++;
3096 } 3103 }
3097 } 3104 }
3105 spin_unlock_irq(&conf->device_lock);
3098 rcu_read_unlock(); 3106 rcu_read_unlock();
3099
3100 if (unlikely(blocked_rdev)) {
3101 if (s.syncing || s.expanding || s.expanded ||
3102 s.to_write || s.written) {
3103 set_bit(STRIPE_HANDLE, &sh->state);
3104 goto unlock;
3105 }
3106 /* There is nothing for the blocked_rdev to block */
3107 rdev_dec_pending(blocked_rdev, conf->mddev);
3108 blocked_rdev = NULL;
3109 }
3110
3111 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3112 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3113 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3114 }
3115
3116 pr_debug("locked=%d uptodate=%d to_read=%d"
3117 " to_write=%d failed=%d failed_num=%d\n",
3118 s.locked, s.uptodate, s.to_read, s.to_write,
3119 s.failed, s.failed_num);
3120 /* check if the array has lost two devices and, if so, some requests might
3121 * need to be failed
3122 */
3123 if (s.failed > 1 && s.to_read+s.to_write+s.written)
3124 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3125 if (s.failed > 1 && s.syncing) {
3126 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3127 clear_bit(STRIPE_SYNCING, &sh->state);
3128 s.syncing = 0;
3129 }
3130
3131 /* might be able to return some write requests if the parity block
3132 * is safe, or on a failed drive
3133 */
3134 dev = &sh->dev[sh->pd_idx];
3135 if ( s.written &&
3136 ((test_bit(R5_Insync, &dev->flags) &&
3137 !test_bit(R5_LOCKED, &dev->flags) &&
3138 test_bit(R5_UPTODATE, &dev->flags)) ||
3139 (s.failed == 1 && s.failed_num == sh->pd_idx)))
3140 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3141
3142 /* Now we might consider reading some blocks, either to check/generate
3143 * parity, or to satisfy requests
3144 * or to load a block that is being partially written.
3145 */
3146 if (s.to_read || s.non_overwrite ||
3147 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3148 handle_stripe_fill5(sh, &s, disks);
3149
3150 /* Now we check to see if any write operations have recently
3151 * completed
3152 */
3153 prexor = 0;
3154 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3155 prexor = 1;
3156 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3157 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3158 sh->reconstruct_state = reconstruct_state_idle;
3159
3160 /* All the 'written' buffers and the parity block are ready to
3161 * be written back to disk
3162 */
3163 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3164 for (i = disks; i--; ) {
3165 dev = &sh->dev[i];
3166 if (test_bit(R5_LOCKED, &dev->flags) &&
3167 (i == sh->pd_idx || dev->written)) {
3168 pr_debug("Writing block %d\n", i);
3169 set_bit(R5_Wantwrite, &dev->flags);
3170 if (prexor)
3171 continue;
3172 if (!test_bit(R5_Insync, &dev->flags) ||
3173 (i == sh->pd_idx && s.failed == 0))
3174 set_bit(STRIPE_INSYNC, &sh->state);
3175 }
3176 }
3177 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3178 dec_preread_active = 1;
3179 }
3180
3181 /* Now to consider new write requests and what else, if anything
3182 * should be read. We do not handle new writes when:
3183 * 1/ A 'write' operation (copy+xor) is already in flight.
3184 * 2/ A 'check' operation is in flight, as it may clobber the parity
3185 * block.
3186 */
3187 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3188 handle_stripe_dirtying5(conf, sh, &s, disks);
3189
3190 /* maybe we need to check and possibly fix the parity for this stripe
3191 * Any reads will already have been scheduled, so we just see if enough
3192 * data is available. The parity check is held off while parity
3193 * dependent operations are in flight.
3194 */
3195 if (sh->check_state ||
3196 (s.syncing && s.locked == 0 &&
3197 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3198 !test_bit(STRIPE_INSYNC, &sh->state)))
3199 handle_parity_checks5(conf, sh, &s, disks);
3200
3201 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3202 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3203 clear_bit(STRIPE_SYNCING, &sh->state);
3204 }
3205
3206 /* If the failed drive is just a ReadError, then we might need to progress
3207 * the repair/check process
3208 */
3209 if (s.failed == 1 && !conf->mddev->ro &&
3210 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
3211 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
3212 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
3213 ) {
3214 dev = &sh->dev[s.failed_num];
3215 if (!test_bit(R5_ReWrite, &dev->flags)) {
3216 set_bit(R5_Wantwrite, &dev->flags);
3217 set_bit(R5_ReWrite, &dev->flags);
3218 set_bit(R5_LOCKED, &dev->flags);
3219 s.locked++;
3220 } else {
3221 /* let's read it back */
3222 set_bit(R5_Wantread, &dev->flags);
3223 set_bit(R5_LOCKED, &dev->flags);
3224 s.locked++;
3225 }
3226 }
3227
3228 /* Finish reconstruct operations initiated by the expansion process */
3229 if (sh->reconstruct_state == reconstruct_state_result) {
3230 struct stripe_head *sh2
3231 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3232 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3233 /* sh cannot be written until sh2 has been read.
3234 * so arrange for sh to be delayed a little
3235 */
3236 set_bit(STRIPE_DELAYED, &sh->state);
3237 set_bit(STRIPE_HANDLE, &sh->state);
3238 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3239 &sh2->state))
3240 atomic_inc(&conf->preread_active_stripes);
3241 release_stripe(sh2);
3242 goto unlock;
3243 }
3244 if (sh2)
3245 release_stripe(sh2);
3246
3247 sh->reconstruct_state = reconstruct_state_idle;
3248 clear_bit(STRIPE_EXPANDING, &sh->state);
3249 for (i = conf->raid_disks; i--; ) {
3250 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3251 set_bit(R5_LOCKED, &sh->dev[i].flags);
3252 s.locked++;
3253 }
3254 }
3255
3256 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3257 !sh->reconstruct_state) {
3258 /* Need to write out all blocks after computing parity */
3259 sh->disks = conf->raid_disks;
3260 stripe_set_idx(sh->sector, conf, 0, sh);
3261 schedule_reconstruction(sh, &s, 1, 1);
3262 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3263 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3264 atomic_dec(&conf->reshape_stripes);
3265 wake_up(&conf->wait_for_overlap);
3266 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3267 }
3268
3269 if (s.expanding && s.locked == 0 &&
3270 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3271 handle_stripe_expansion(conf, sh, NULL);
3272
3273 unlock:
3274 spin_unlock(&sh->lock);
3275
3276 /* wait for this device to become unblocked */
3277 if (unlikely(blocked_rdev))
3278 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3279
3280 if (s.ops_request)
3281 raid_run_ops(sh, s.ops_request);
3282
3283 ops_run_io(sh, &s);
3284
3285 if (dec_preread_active) {
3286 /* We delay this until after ops_run_io so that if make_request
3287 * is waiting on a flush, it won't continue until the writes
3288 * have actually been submitted.
3289 */
3290 atomic_dec(&conf->preread_active_stripes);
3291 if (atomic_read(&conf->preread_active_stripes) <
3292 IO_THRESHOLD)
3293 md_wakeup_thread(conf->mddev->thread);
3294 }
3295 return_io(return_bi);
3296} 3107}
3297 3108
3298static void handle_stripe6(struct stripe_head *sh) 3109static void handle_stripe(struct stripe_head *sh)
3299{ 3110{
3111 struct stripe_head_state s;
3300 raid5_conf_t *conf = sh->raid_conf; 3112 raid5_conf_t *conf = sh->raid_conf;
3113 int i;
3114 int prexor;
3301 int disks = sh->disks; 3115 int disks = sh->disks;
3302 struct bio *return_bi = NULL; 3116 struct r5dev *pdev, *qdev;
3303 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3117
3304 struct stripe_head_state s; 3118 clear_bit(STRIPE_HANDLE, &sh->state);
3305 struct r6_state r6s; 3119 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
3306 struct r5dev *dev, *pdev, *qdev; 3120 /* already being handled, ensure it gets handled
3307 mdk_rdev_t *blocked_rdev = NULL; 3121 * again when current action finishes */
3308 int dec_preread_active = 0; 3122 set_bit(STRIPE_HANDLE, &sh->state);
3123 return;
3124 }
3125
3126 if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3127 set_bit(STRIPE_SYNCING, &sh->state);
3128 clear_bit(STRIPE_INSYNC, &sh->state);
3129 }
3130 clear_bit(STRIPE_DELAYED, &sh->state);
3309 3131
3310 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3132 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3311 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3133 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3312 (unsigned long long)sh->sector, sh->state, 3134 (unsigned long long)sh->sector, sh->state,
3313 atomic_read(&sh->count), pd_idx, qd_idx, 3135 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3314 sh->check_state, sh->reconstruct_state); 3136 sh->check_state, sh->reconstruct_state);
3315 memset(&s, 0, sizeof(s));
3316
3317 spin_lock(&sh->lock);
3318 clear_bit(STRIPE_HANDLE, &sh->state);
3319 clear_bit(STRIPE_DELAYED, &sh->state);
3320
3321 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3322 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3323 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3324 /* Now to look around and see what can be done */
3325 3137
3326 rcu_read_lock(); 3138 analyse_stripe(sh, &s);
3327 for (i=disks; i--; ) {
3328 mdk_rdev_t *rdev;
3329 dev = &sh->dev[i];
3330 3139
3331 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3140 if (s.handle_bad_blocks) {
3332 i, dev->flags, dev->toread, dev->towrite, dev->written); 3141 set_bit(STRIPE_HANDLE, &sh->state);
3333 /* maybe we can reply to a read 3142 goto finish;
3334 *
3335 * new wantfill requests are only permitted while
3336 * ops_complete_biofill is guaranteed to be inactive
3337 */
3338 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3339 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3340 set_bit(R5_Wantfill, &dev->flags);
3341
3342 /* now count some things */
3343 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3344 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3345 if (test_bit(R5_Wantcompute, &dev->flags)) {
3346 s.compute++;
3347 BUG_ON(s.compute > 2);
3348 }
3349
3350 if (test_bit(R5_Wantfill, &dev->flags)) {
3351 s.to_fill++;
3352 } else if (dev->toread)
3353 s.to_read++;
3354 if (dev->towrite) {
3355 s.to_write++;
3356 if (!test_bit(R5_OVERWRITE, &dev->flags))
3357 s.non_overwrite++;
3358 }
3359 if (dev->written)
3360 s.written++;
3361 rdev = rcu_dereference(conf->disks[i].rdev);
3362 if (blocked_rdev == NULL &&
3363 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3364 blocked_rdev = rdev;
3365 atomic_inc(&rdev->nr_pending);
3366 }
3367 clear_bit(R5_Insync, &dev->flags);
3368 if (!rdev)
3369 /* Not in-sync */;
3370 else if (test_bit(In_sync, &rdev->flags))
3371 set_bit(R5_Insync, &dev->flags);
3372 else {
3373 /* in sync if before recovery_offset */
3374 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3375 set_bit(R5_Insync, &dev->flags);
3376 }
3377 if (!test_bit(R5_Insync, &dev->flags)) {
3378 /* The ReadError flag will just be confusing now */
3379 clear_bit(R5_ReadError, &dev->flags);
3380 clear_bit(R5_ReWrite, &dev->flags);
3381 }
3382 if (test_bit(R5_ReadError, &dev->flags))
3383 clear_bit(R5_Insync, &dev->flags);
3384 if (!test_bit(R5_Insync, &dev->flags)) {
3385 if (s.failed < 2)
3386 r6s.failed_num[s.failed] = i;
3387 s.failed++;
3388 }
3389 } 3143 }
3390 rcu_read_unlock();
3391 3144
3392 if (unlikely(blocked_rdev)) { 3145 if (unlikely(s.blocked_rdev)) {
3393 if (s.syncing || s.expanding || s.expanded || 3146 if (s.syncing || s.expanding || s.expanded ||
3394 s.to_write || s.written) { 3147 s.to_write || s.written) {
3395 set_bit(STRIPE_HANDLE, &sh->state); 3148 set_bit(STRIPE_HANDLE, &sh->state);
3396 goto unlock; 3149 goto finish;
3397 } 3150 }
3398 /* There is nothing for the blocked_rdev to block */ 3151 /* There is nothing for the blocked_rdev to block */
3399 rdev_dec_pending(blocked_rdev, conf->mddev); 3152 rdev_dec_pending(s.blocked_rdev, conf->mddev);
3400 blocked_rdev = NULL; 3153 s.blocked_rdev = NULL;
3401 } 3154 }
3402 3155
3403 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3156 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
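The per-stripe spinlock is gone in the merged handler; mutual exclusion now comes from the STRIPE_ACTIVE state bit, taken with test_and_set_bit_lock() on entry and released with clear_bit_unlock() at the end, while a second caller simply re-marks the stripe with STRIPE_HANDLE and bails out. A rough user-space model of that "try the bit, otherwise ask for another pass" idiom using C11 atomics; stripe_model, S_ACTIVE and S_HANDLE are illustrative stand-ins, not the kernel bitops.

#include <stdatomic.h>
#include <stdio.h>

#define S_ACTIVE (1u << 0)   /* someone is handling the stripe        */
#define S_HANDLE (1u << 1)   /* stripe needs (another) handling pass  */

struct stripe_model {
	atomic_uint state;
};

static void handle(struct stripe_model *sh)
{
	unsigned int old = atomic_fetch_or(&sh->state, S_ACTIVE);

	if (old & S_ACTIVE) {
		/* Already being handled: make sure the current owner
		 * (or the next scheduling pass) looks at it again. */
		atomic_fetch_or(&sh->state, S_HANDLE);
		return;
	}

	/* ... analyse and act on the stripe here ... */

	/* Drop "ownership"; S_HANDLE may still be set, which causes
	 * the stripe to be queued for another pass. */
	atomic_fetch_and(&sh->state, ~S_ACTIVE);
}

int main(void)
{
	struct stripe_model sh;

	atomic_init(&sh.state, 0);
	handle(&sh);
	printf("state=%#x\n", atomic_load(&sh.state));
	return 0;
}
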
@@ -3408,83 +3161,92 @@ static void handle_stripe6(struct stripe_head *sh)
3408 pr_debug("locked=%d uptodate=%d to_read=%d" 3161 pr_debug("locked=%d uptodate=%d to_read=%d"
3409 " to_write=%d failed=%d failed_num=%d,%d\n", 3162 " to_write=%d failed=%d failed_num=%d,%d\n",
3410 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3163 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3411 r6s.failed_num[0], r6s.failed_num[1]); 3164 s.failed_num[0], s.failed_num[1]);
3412 /* check if the array has lost >2 devices and, if so, some requests 3165 /* check if the array has lost more than max_degraded devices and,
3413 * might need to be failed 3166 * if so, some requests might need to be failed.
3414 */ 3167 */
3415 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3168 if (s.failed > conf->max_degraded) {
3416 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3169 sh->check_state = 0;
3417 if (s.failed > 2 && s.syncing) { 3170 sh->reconstruct_state = 0;
3418 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3171 if (s.to_read+s.to_write+s.written)
3419 clear_bit(STRIPE_SYNCING, &sh->state); 3172 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3420 s.syncing = 0; 3173 if (s.syncing)
3174 handle_failed_sync(conf, sh, &s);
3421 } 3175 }
3422 3176
3423 /* 3177 /*
3424 * might be able to return some write requests if the parity blocks 3178 * might be able to return some write requests if the parity blocks
3425 * are safe, or on a failed drive 3179 * are safe, or on a failed drive
3426 */ 3180 */
3427 pdev = &sh->dev[pd_idx]; 3181 pdev = &sh->dev[sh->pd_idx];
3428 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3182 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3429 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3183 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3430 qdev = &sh->dev[qd_idx]; 3184 qdev = &sh->dev[sh->qd_idx];
3431 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3185 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3432 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3186 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3433 3187 || conf->level < 6;
3434 if ( s.written && 3188
3435 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3189 if (s.written &&
3190 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3436 && !test_bit(R5_LOCKED, &pdev->flags) 3191 && !test_bit(R5_LOCKED, &pdev->flags)
3437 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3192 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3438 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3193 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3439 && !test_bit(R5_LOCKED, &qdev->flags) 3194 && !test_bit(R5_LOCKED, &qdev->flags)
3440 && test_bit(R5_UPTODATE, &qdev->flags))))) 3195 && test_bit(R5_UPTODATE, &qdev->flags)))))
3441 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3196 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3442 3197
3443 /* Now we might consider reading some blocks, either to check/generate 3198 /* Now we might consider reading some blocks, either to check/generate
3444 * parity, or to satisfy requests 3199 * parity, or to satisfy requests
3445 * or to load a block that is being partially written. 3200 * or to load a block that is being partially written.
3446 */ 3201 */
3447 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3202 if (s.to_read || s.non_overwrite
3448 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3203 || (conf->level == 6 && s.to_write && s.failed)
3449 handle_stripe_fill6(sh, &s, &r6s, disks); 3204 || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3205 handle_stripe_fill(sh, &s, disks);
3450 3206
3451 /* Now we check to see if any write operations have recently 3207 /* Now we check to see if any write operations have recently
3452 * completed 3208 * completed
3453 */ 3209 */
3454 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3210 prexor = 0;
3455 3211 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3212 prexor = 1;
3213 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3214 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3456 sh->reconstruct_state = reconstruct_state_idle; 3215 sh->reconstruct_state = reconstruct_state_idle;
3457 /* All the 'written' buffers and the parity blocks are ready to 3216
3217 /* All the 'written' buffers and the parity block are ready to
3458 * be written back to disk 3218 * be written back to disk
3459 */ 3219 */
3460 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3220 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3461 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3221 BUG_ON(sh->qd_idx >= 0 &&
3222 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3462 for (i = disks; i--; ) { 3223 for (i = disks; i--; ) {
3463 dev = &sh->dev[i]; 3224 struct r5dev *dev = &sh->dev[i];
3464 if (test_bit(R5_LOCKED, &dev->flags) && 3225 if (test_bit(R5_LOCKED, &dev->flags) &&
3465 (i == sh->pd_idx || i == qd_idx || 3226 (i == sh->pd_idx || i == sh->qd_idx ||
3466 dev->written)) { 3227 dev->written)) {
3467 pr_debug("Writing block %d\n", i); 3228 pr_debug("Writing block %d\n", i);
3468 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3469 set_bit(R5_Wantwrite, &dev->flags); 3229 set_bit(R5_Wantwrite, &dev->flags);
3230 if (prexor)
3231 continue;
3470 if (!test_bit(R5_Insync, &dev->flags) || 3232 if (!test_bit(R5_Insync, &dev->flags) ||
3471 ((i == sh->pd_idx || i == qd_idx) && 3233 ((i == sh->pd_idx || i == sh->qd_idx) &&
3472 s.failed == 0)) 3234 s.failed == 0))
3473 set_bit(STRIPE_INSYNC, &sh->state); 3235 set_bit(STRIPE_INSYNC, &sh->state);
3474 } 3236 }
3475 } 3237 }
3476 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3238 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3477 dec_preread_active = 1; 3239 s.dec_preread_active = 1;
3478 } 3240 }
3479 3241
3480 /* Now to consider new write requests and what else, if anything 3242 /* Now to consider new write requests and what else, if anything
3481 * should be read. We do not handle new writes when: 3243 * should be read. We do not handle new writes when:
3482 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3244 * 1/ A 'write' operation (copy+xor) is already in flight.
3483 * 2/ A 'check' operation is in flight, as it may clobber the parity 3245 * 2/ A 'check' operation is in flight, as it may clobber the parity
3484 * block. 3246 * block.
3485 */ 3247 */
3486 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3248 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3487 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3249 handle_stripe_dirtying(conf, sh, &s, disks);
3488 3250
3489 /* maybe we need to check and possibly fix the parity for this stripe 3251 /* maybe we need to check and possibly fix the parity for this stripe
3490 * Any reads will already have been scheduled, so we just see if enough 3252 * Any reads will already have been scheduled, so we just see if enough
@@ -3494,20 +3256,24 @@ static void handle_stripe6(struct stripe_head *sh)
3494 if (sh->check_state || 3256 if (sh->check_state ||
3495 (s.syncing && s.locked == 0 && 3257 (s.syncing && s.locked == 0 &&
3496 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3258 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3497 !test_bit(STRIPE_INSYNC, &sh->state))) 3259 !test_bit(STRIPE_INSYNC, &sh->state))) {
3498 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3260 if (conf->level == 6)
3261 handle_parity_checks6(conf, sh, &s, disks);
3262 else
3263 handle_parity_checks5(conf, sh, &s, disks);
3264 }
3499 3265
3500 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3266 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3501 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3267 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3502 clear_bit(STRIPE_SYNCING, &sh->state); 3268 clear_bit(STRIPE_SYNCING, &sh->state);
3503 } 3269 }
3504 3270
3505 /* If the failed drives are just a ReadError, then we might need 3271 /* If the failed drives are just a ReadError, then we might need
3506 * to progress the repair/check process 3272 * to progress the repair/check process
3507 */ 3273 */
3508 if (s.failed <= 2 && !conf->mddev->ro) 3274 if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3509 for (i = 0; i < s.failed; i++) { 3275 for (i = 0; i < s.failed; i++) {
3510 dev = &sh->dev[r6s.failed_num[i]]; 3276 struct r5dev *dev = &sh->dev[s.failed_num[i]];
3511 if (test_bit(R5_ReadError, &dev->flags) 3277 if (test_bit(R5_ReadError, &dev->flags)
3512 && !test_bit(R5_LOCKED, &dev->flags) 3278 && !test_bit(R5_LOCKED, &dev->flags)
3513 && test_bit(R5_UPTODATE, &dev->flags) 3279 && test_bit(R5_UPTODATE, &dev->flags)
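Two of the tricks that let one handle_stripe() serve both code paths show up in the hunks above: for RAID4/5 the Q-device tests are neutralized by forcing q_failed whenever conf->level < 6 (qd_idx is not a real device there, hence the new qd_idx >= 0 guard on the BUG_ON), and the parity check itself is simply dispatched on conf->level. A small sketch of the failure test, mirroring the expressions in the diff with simplified parameters:

#include <stdbool.h>
#include <stdio.h>

/* failed_num[] holds up to two failed device indices; -1 means "unused". */
static bool disk_failed(const int failed_num[2], int failed, int idx)
{
	return (failed >= 1 && failed_num[0] == idx) ||
	       (failed >= 2 && failed_num[1] == idx);
}

int main(void)
{
	int failed_num[2] = { 3, -1 };
	int failed = 1, pd_idx = 3, qd_idx = -1, level = 5;

	bool p_failed = disk_failed(failed_num, failed, pd_idx);
	/* For RAID4/5 there is no Q device, so q_failed is forced true
	 * and the Q-related conditions drop out of the write-back test. */
	bool q_failed = disk_failed(failed_num, failed, qd_idx) || level < 6;

	printf("p_failed=%d q_failed=%d\n", p_failed, q_failed);
	return 0;
}
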
@@ -3526,8 +3292,26 @@ static void handle_stripe6(struct stripe_head *sh)
3526 } 3292 }
3527 } 3293 }
3528 3294
3295
3529 /* Finish reconstruct operations initiated by the expansion process */ 3296 /* Finish reconstruct operations initiated by the expansion process */
3530 if (sh->reconstruct_state == reconstruct_state_result) { 3297 if (sh->reconstruct_state == reconstruct_state_result) {
3298 struct stripe_head *sh_src
3299 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3300 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3301 /* sh cannot be written until sh_src has been read.
3302 * so arrange for sh to be delayed a little
3303 */
3304 set_bit(STRIPE_DELAYED, &sh->state);
3305 set_bit(STRIPE_HANDLE, &sh->state);
3306 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3307 &sh_src->state))
3308 atomic_inc(&conf->preread_active_stripes);
3309 release_stripe(sh_src);
3310 goto finish;
3311 }
3312 if (sh_src)
3313 release_stripe(sh_src);
3314
3531 sh->reconstruct_state = reconstruct_state_idle; 3315 sh->reconstruct_state = reconstruct_state_idle;
3532 clear_bit(STRIPE_EXPANDING, &sh->state); 3316 clear_bit(STRIPE_EXPANDING, &sh->state);
3533 for (i = conf->raid_disks; i--; ) { 3317 for (i = conf->raid_disks; i--; ) {
@@ -3539,24 +3323,7 @@ static void handle_stripe6(struct stripe_head *sh)
3539 3323
3540 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3324 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3541 !sh->reconstruct_state) { 3325 !sh->reconstruct_state) {
3542 struct stripe_head *sh2 3326 /* Need to write out all blocks after computing parity */
3543 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3544 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3545 /* sh cannot be written until sh2 has been read.
3546 * so arrange for sh to be delayed a little
3547 */
3548 set_bit(STRIPE_DELAYED, &sh->state);
3549 set_bit(STRIPE_HANDLE, &sh->state);
3550 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3551 &sh2->state))
3552 atomic_inc(&conf->preread_active_stripes);
3553 release_stripe(sh2);
3554 goto unlock;
3555 }
3556 if (sh2)
3557 release_stripe(sh2);
3558
3559 /* Need to write out all blocks after computing P&Q */
3560 sh->disks = conf->raid_disks; 3327 sh->disks = conf->raid_disks;
3561 stripe_set_idx(sh->sector, conf, 0, sh); 3328 stripe_set_idx(sh->sector, conf, 0, sh);
3562 schedule_reconstruction(sh, &s, 1, 1); 3329 schedule_reconstruction(sh, &s, 1, 1);
@@ -3569,22 +3336,39 @@ static void handle_stripe6(struct stripe_head *sh)
3569 3336
3570 if (s.expanding && s.locked == 0 && 3337 if (s.expanding && s.locked == 0 &&
3571 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3338 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3572 handle_stripe_expansion(conf, sh, &r6s); 3339 handle_stripe_expansion(conf, sh);
3573
3574 unlock:
3575 spin_unlock(&sh->lock);
3576 3340
3341finish:
3577 /* wait for this device to become unblocked */ 3342 /* wait for this device to become unblocked */
3578 if (unlikely(blocked_rdev)) 3343 if (conf->mddev->external && unlikely(s.blocked_rdev))
3579 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3344 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
3345
3346 if (s.handle_bad_blocks)
3347 for (i = disks; i--; ) {
3348 mdk_rdev_t *rdev;
3349 struct r5dev *dev = &sh->dev[i];
3350 if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3351 /* We own a safe reference to the rdev */
3352 rdev = conf->disks[i].rdev;
3353 if (!rdev_set_badblocks(rdev, sh->sector,
3354 STRIPE_SECTORS, 0))
3355 md_error(conf->mddev, rdev);
3356 rdev_dec_pending(rdev, conf->mddev);
3357 }
3358 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3359 rdev = conf->disks[i].rdev;
3360 rdev_clear_badblocks(rdev, sh->sector,
3361 STRIPE_SECTORS);
3362 rdev_dec_pending(rdev, conf->mddev);
3363 }
3364 }
3580 3365
3581 if (s.ops_request) 3366 if (s.ops_request)
3582 raid_run_ops(sh, s.ops_request); 3367 raid_run_ops(sh, s.ops_request);
3583 3368
3584 ops_run_io(sh, &s); 3369 ops_run_io(sh, &s);
3585 3370
3586 3371 if (s.dec_preread_active) {
3587 if (dec_preread_active) {
3588 /* We delay this until after ops_run_io so that if make_request 3372 /* We delay this until after ops_run_io so that if make_request
3589 * is waiting on a flush, it won't continue until the writes 3373 * is waiting on a flush, it won't continue until the writes
3590 * have actually been submitted. 3374 * have actually been submitted.
@@ -3595,15 +3379,9 @@ static void handle_stripe6(struct stripe_head *sh)
3595 md_wakeup_thread(conf->mddev->thread); 3379 md_wakeup_thread(conf->mddev->thread);
3596 } 3380 }
3597 3381
3598 return_io(return_bi); 3382 return_io(s.return_bi);
3599}
3600 3383
3601static void handle_stripe(struct stripe_head *sh) 3384 clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3602{
3603 if (sh->raid_conf->level == 6)
3604 handle_stripe6(sh);
3605 else
3606 handle_stripe5(sh);
3607} 3385}
3608 3386
3609static void raid5_activate_delayed(raid5_conf_t *conf) 3387static void raid5_activate_delayed(raid5_conf_t *conf)
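In the finish: path above, the handler walks the devices once more after I/O has been scheduled: R5_WriteError gets the failing range recorded with rdev_set_badblocks() (with md_error() as the fallback when recording fails), R5_MadeGood clears a previously recorded range via rdev_clear_badblocks(), and in both cases the reference taken in analyse_stripe() is dropped with rdev_dec_pending(). A standalone sketch of that bookkeeping against a toy bad-block table; record_bad() and clear_bad() are hypothetical stand-ins, not the md API.

#include <stdbool.h>
#include <stdio.h>

#define MAX_BAD 4

struct bad_list { long start[MAX_BAD]; int sectors[MAX_BAD]; int n; };

/* Stand-in for rdev_set_badblocks(): returns false if the table is full. */
static bool record_bad(struct bad_list *b, long sector, int sectors)
{
	if (b->n == MAX_BAD)
		return false;
	b->start[b->n] = sector;
	b->sectors[b->n] = sectors;
	b->n++;
	return true;
}

/* Stand-in for rdev_clear_badblocks(): drop an entry that was re-written. */
static void clear_bad(struct bad_list *b, long sector)
{
	for (int i = 0; i < b->n; i++)
		if (b->start[i] == sector) {
			b->start[i] = b->start[--b->n];
			b->sectors[i] = b->sectors[b->n];
			return;
		}
}

int main(void)
{
	struct bad_list bl = { .n = 0 };
	long stripe_sector = 1024;

	if (!record_bad(&bl, stripe_sector, 8))
		puts("table full: would fail the whole device instead");
	clear_bad(&bl, stripe_sector);          /* rewrite succeeded later */
	printf("entries left: %d\n", bl.n);
	return 0;
}
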
@@ -3833,6 +3611,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3833 rcu_read_lock(); 3611 rcu_read_lock();
3834 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3612 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3835 if (rdev && test_bit(In_sync, &rdev->flags)) { 3613 if (rdev && test_bit(In_sync, &rdev->flags)) {
3614 sector_t first_bad;
3615 int bad_sectors;
3616
3836 atomic_inc(&rdev->nr_pending); 3617 atomic_inc(&rdev->nr_pending);
3837 rcu_read_unlock(); 3618 rcu_read_unlock();
3838 raid_bio->bi_next = (void*)rdev; 3619 raid_bio->bi_next = (void*)rdev;
@@ -3840,8 +3621,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3840 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3621 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3841 align_bi->bi_sector += rdev->data_offset; 3622 align_bi->bi_sector += rdev->data_offset;
3842 3623
3843 if (!bio_fits_rdev(align_bi)) { 3624 if (!bio_fits_rdev(align_bi) ||
3844 /* too big in some way */ 3625 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3626 &first_bad, &bad_sectors)) {
3627 /* too big in some way, or has a known bad block */
3845 bio_put(align_bi); 3628 bio_put(align_bi);
3846 rdev_dec_pending(rdev, mddev); 3629 rdev_dec_pending(rdev, mddev);
3847 return 0; 3630 return 0;
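chunk_aligned_read() now refuses the bypass path not only when the bio does not fit the rdev but also when it overlaps a known bad block, so such reads fall back to the stripe cache where per-device recovery can happen. A condensed sketch of that gate; the helpers below are illustrative, where the real code uses bio_fits_rdev() and is_badblock().

#include <stdbool.h>
#include <stdio.h>

static bool fits_device(long sector, int sectors, long dev_size)
{
	return sector + sectors <= dev_size;
}

static bool overlaps_bad_block(long sector, int sectors,
			       long bad_start, int bad_len)
{
	return sector < bad_start + bad_len && bad_start < sector + sectors;
}

static bool aligned_read_ok(long sector, int sectors, long dev_size,
			    long bad_start, int bad_len)
{
	/* Too big in some way, or hits a known bad block: use the
	 * normal stripe-cache path instead of the direct read. */
	return fits_device(sector, sectors, dev_size) &&
	       !overlaps_bad_block(sector, sectors, bad_start, bad_len);
}

int main(void)
{
	printf("%d\n", aligned_read_ok(100, 8, 1000, 104, 4)); /* 0: overlap */
	return 0;
}
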
@@ -4016,7 +3799,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4016 } 3799 }
4017 } 3800 }
4018 3801
4019 if (bio_data_dir(bi) == WRITE && 3802 if (rw == WRITE &&
4020 logical_sector >= mddev->suspend_lo && 3803 logical_sector >= mddev->suspend_lo &&
4021 logical_sector < mddev->suspend_hi) { 3804 logical_sector < mddev->suspend_hi) {
4022 release_stripe(sh); 3805 release_stripe(sh);
@@ -4034,7 +3817,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4034 } 3817 }
4035 3818
4036 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3819 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4037 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3820 !add_stripe_bio(sh, bi, dd_idx, rw)) {
4038 /* Stripe is busy expanding or 3821 /* Stripe is busy expanding or
4039 * add failed due to overlap. Flush everything 3822 * add failed due to overlap. Flush everything
4040 * and wait a while 3823 * and wait a while
@@ -4375,10 +4158,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4375 4158
4376 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4159 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4377 4160
4378 spin_lock(&sh->lock); 4161 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4379 set_bit(STRIPE_SYNCING, &sh->state);
4380 clear_bit(STRIPE_INSYNC, &sh->state);
4381 spin_unlock(&sh->lock);
4382 4162
4383 handle_stripe(sh); 4163 handle_stripe(sh);
4384 release_stripe(sh); 4164 release_stripe(sh);
@@ -4509,6 +4289,9 @@ static void raid5d(mddev_t *mddev)
4509 release_stripe(sh); 4289 release_stripe(sh);
4510 cond_resched(); 4290 cond_resched();
4511 4291
4292 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4293 md_check_recovery(mddev);
4294
4512 spin_lock_irq(&conf->device_lock); 4295 spin_lock_irq(&conf->device_lock);
4513 } 4296 }
4514 pr_debug("%d stripes handled\n", handled); 4297 pr_debug("%d stripes handled\n", handled);
@@ -5162,8 +4945,7 @@ static int run(mddev_t *mddev)
5162 4945
5163 return 0; 4946 return 0;
5164abort: 4947abort:
5165 md_unregister_thread(mddev->thread); 4948 md_unregister_thread(&mddev->thread);
5166 mddev->thread = NULL;
5167 if (conf) { 4949 if (conf) {
5168 print_raid5_conf(conf); 4950 print_raid5_conf(conf);
5169 free_conf(conf); 4951 free_conf(conf);
@@ -5177,8 +4959,7 @@ static int stop(mddev_t *mddev)
5177{ 4959{
5178 raid5_conf_t *conf = mddev->private; 4960 raid5_conf_t *conf = mddev->private;
5179 4961
5180 md_unregister_thread(mddev->thread); 4962 md_unregister_thread(&mddev->thread);
5181 mddev->thread = NULL;
5182 if (mddev->queue) 4963 if (mddev->queue)
5183 mddev->queue->backing_dev_info.congested_fn = NULL; 4964 mddev->queue->backing_dev_info.congested_fn = NULL;
5184 free_conf(conf); 4965 free_conf(conf);
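In the two hunks above, md_unregister_thread() now takes the address of the thread pointer so it can clear the caller's pointer itself, and the explicit "mddev->thread = NULL" lines disappear from the error and stop paths. A generic sketch of that "tear down via double pointer" idiom; the worker type and names are illustrative, not the md thread API.

#include <stdio.h>
#include <stdlib.h>

struct worker { const char *name; };

/* Frees *wp and clears the caller's pointer so it cannot be reused. */
static void unregister_worker(struct worker **wp)
{
	struct worker *w = *wp;

	if (!w)
		return;
	*wp = NULL;        /* callers can no longer see a stale pointer */
	printf("stopping %s\n", w->name);
	free(w);
}

int main(void)
{
	struct worker *thread = malloc(sizeof(*thread));

	if (!thread)
		return 1;
	thread->name = "raid5d";

	unregister_worker(&thread);   /* one call replaces the old two lines */
	unregister_worker(&thread);   /* safe: already NULL */
	return 0;
}
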
@@ -5313,6 +5094,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
5313 * isn't possible. 5094 * isn't possible.
5314 */ 5095 */
5315 if (!test_bit(Faulty, &rdev->flags) && 5096 if (!test_bit(Faulty, &rdev->flags) &&
5097 mddev->recovery_disabled != conf->recovery_disabled &&
5316 !has_failed(conf) && 5098 !has_failed(conf) &&
5317 number < conf->raid_disks) { 5099 number < conf->raid_disks) {
5318 err = -EBUSY; 5100 err = -EBUSY;
@@ -5341,6 +5123,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5341 int first = 0; 5123 int first = 0;
5342 int last = conf->raid_disks - 1; 5124 int last = conf->raid_disks - 1;
5343 5125
5126 if (mddev->recovery_disabled == conf->recovery_disabled)
5127 return -EBUSY;
5128
5344 if (has_failed(conf)) 5129 if (has_failed(conf))
5345 /* no point adding a device */ 5130 /* no point adding a device */
5346 return -EINVAL; 5131 return -EINVAL;
@@ -5519,16 +5304,14 @@ static int raid5_start_reshape(mddev_t *mddev)
5519 if (rdev->raid_disk < 0 && 5304 if (rdev->raid_disk < 0 &&
5520 !test_bit(Faulty, &rdev->flags)) { 5305 !test_bit(Faulty, &rdev->flags)) {
5521 if (raid5_add_disk(mddev, rdev) == 0) { 5306 if (raid5_add_disk(mddev, rdev) == 0) {
5522 char nm[20];
5523 if (rdev->raid_disk 5307 if (rdev->raid_disk
5524 >= conf->previous_raid_disks) { 5308 >= conf->previous_raid_disks) {
5525 set_bit(In_sync, &rdev->flags); 5309 set_bit(In_sync, &rdev->flags);
5526 added_devices++; 5310 added_devices++;
5527 } else 5311 } else
5528 rdev->recovery_offset = 0; 5312 rdev->recovery_offset = 0;
5529 sprintf(nm, "rd%d", rdev->raid_disk); 5313
5530 if (sysfs_create_link(&mddev->kobj, 5314 if (sysfs_link_rdev(mddev, rdev))
5531 &rdev->kobj, nm))
5532 /* Failure here is OK */; 5315 /* Failure here is OK */;
5533 } 5316 }
5534 } else if (rdev->raid_disk >= conf->previous_raid_disks 5317 } else if (rdev->raid_disk >= conf->previous_raid_disks
@@ -5624,9 +5407,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5624 d++) { 5407 d++) {
5625 mdk_rdev_t *rdev = conf->disks[d].rdev; 5408 mdk_rdev_t *rdev = conf->disks[d].rdev;
5626 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5409 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5627 char nm[20]; 5410 sysfs_unlink_rdev(mddev, rdev);
5628 sprintf(nm, "rd%d", rdev->raid_disk);
5629 sysfs_remove_link(&mddev->kobj, nm);
5630 rdev->raid_disk = -1; 5411 rdev->raid_disk = -1;
5631 } 5412 }
5632 } 5413 }
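The open-coded sprintf(nm, "rd%d", ...) plus sysfs_create_link()/sysfs_remove_link() pairs are replaced by sysfs_link_rdev()/sysfs_unlink_rdev() helpers; judging only from the lines removed here, they presumably wrap exactly that name formatting and link call. A hedged user-space approximation of what such a wrapper does, with printf standing in for the sysfs call:

#include <stdio.h>

/* Stand-in for the sysfs link creation; purely illustrative. */
static int create_link(const char *parent, const char *target, const char *name)
{
	printf("link %s/%s -> %s\n", parent, name, target);
	return 0;
}

/* Approximation of sysfs_link_rdev(): format "rd<N>" and create the link. */
static int link_rdev(const char *array_kobj, const char *rdev_kobj,
		     int raid_disk)
{
	char nm[20];

	snprintf(nm, sizeof(nm), "rd%d", raid_disk);
	return create_link(array_kobj, rdev_kobj, nm);
}

int main(void)
{
	if (link_rdev("md0", "dev-sda1", 2))
		;	/* failure here is OK, as the original comment notes */
	return 0;
}
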
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3ca77a2613b..11b9566184b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -6,11 +6,11 @@
6 6
7/* 7/*
8 * 8 *
9 * Each stripe contains one buffer per disc. Each buffer can be in 9 * Each stripe contains one buffer per device. Each buffer can be in
10 * one of a number of states stored in "flags". Changes between 10 * one of a number of states stored in "flags". Changes between
11 * these states happen *almost* exclusively under a per-stripe 11 * these states happen *almost* exclusively under the protection of the
12 * spinlock. Some very specific changes can happen in bi_end_io, and 12 * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
13 * these are not protected by the spin lock. 13 * these are not protected by STRIPE_ACTIVE.
14 * 14 *
15 * The flag bits that are used to represent these states are: 15 * The flag bits that are used to represent these states are:
16 * R5_UPTODATE and R5_LOCKED 16 * R5_UPTODATE and R5_LOCKED
@@ -76,12 +76,10 @@
76 * block and the cached buffer are successfully written, any buffer on 76 * block and the cached buffer are successfully written, any buffer on
77 * a written list can be returned with b_end_io. 77 * a written list can be returned with b_end_io.
78 * 78 *
79 * The write list and read list both act as fifos. The read list is 79 * The write list and read list both act as fifos. The read list,
80 * protected by the device_lock. The write and written lists are 80 * write list and written list are protected by the device_lock.
81 * protected by the stripe lock. The device_lock, which can be 81 * The device_lock is only for list manipulations and will only be
82 * claimed while the stipe lock is held, is only for list 82 * held for a very short time. It can be claimed from interrupts.
83 * manipulations and will only be held for a very short time. It can
84 * be claimed from interrupts.
85 * 83 *
86 * 84 *
87 * Stripes in the stripe cache can be on one of two lists (or on 85 * Stripes in the stripe cache can be on one of two lists (or on
@@ -96,7 +94,6 @@
96 * 94 *
97 * The inactive_list, handle_list and hash bucket lists are all protected by the 95 * The inactive_list, handle_list and hash bucket lists are all protected by the
98 * device_lock. 96 * device_lock.
99 * - stripes on the inactive_list never have their stripe_lock held.
100 * - stripes have a reference counter. If count==0, they are on a list. 97 * - stripes have a reference counter. If count==0, they are on a list.
101 * - If a stripe might need handling, STRIPE_HANDLE is set. 98 * - If a stripe might need handling, STRIPE_HANDLE is set.
102 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on 99 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
@@ -116,10 +113,10 @@
116 * attach a request to an active stripe (add_stripe_bh()) 113 * attach a request to an active stripe (add_stripe_bh())
117 * lockdev attach-buffer unlockdev 114 * lockdev attach-buffer unlockdev
118 * handle a stripe (handle_stripe()) 115 * handle a stripe (handle_stripe())
119 * lockstripe clrSTRIPE_HANDLE ... 116 * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
120 * (lockdev check-buffers unlockdev) .. 117 * (lockdev check-buffers unlockdev) ..
121 * change-state .. 118 * change-state ..
122 * record io/ops needed unlockstripe schedule io/ops 119 * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
123 * release an active stripe (release_stripe()) 120 * release an active stripe (release_stripe())
124 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev 121 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
125 * 122 *
@@ -128,8 +125,7 @@
128 * on a cached buffer, and plus one if the stripe is undergoing stripe 125 * on a cached buffer, and plus one if the stripe is undergoing stripe
129 * operations. 126 * operations.
130 * 127 *
131 * Stripe operations are performed outside the stripe lock, 128 * The stripe operations are:
132 * the stripe operations are:
133 * -copying data between the stripe cache and user application buffers 129 * -copying data between the stripe cache and user application buffers
134 * -computing blocks to save a disk access, or to recover a missing block 130 * -computing blocks to save a disk access, or to recover a missing block
135 * -updating the parity on a write operation (reconstruct write and 131 * -updating the parity on a write operation (reconstruct write and
@@ -159,7 +155,8 @@
159 */ 155 */
160 156
161/* 157/*
162 * Operations state - intermediate states that are visible outside of sh->lock 158 * Operations state - intermediate states that are visible outside of
159 * STRIPE_ACTIVE.
163 * In general _idle indicates nothing is running, _run indicates a data 160 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result 161 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and 162 * is stable and can be acted upon. For simple operations like biofill and
@@ -209,7 +206,6 @@ struct stripe_head {
209 short ddf_layout;/* use DDF ordering to calculate Q */ 206 short ddf_layout;/* use DDF ordering to calculate Q */
210 unsigned long state; /* state flags */ 207 unsigned long state; /* state flags */
211 atomic_t count; /* nr of active thread/requests */ 208 atomic_t count; /* nr of active thread/requests */
212 spinlock_t lock;
213 int bm_seq; /* sequence number for bitmap flushes */ 209 int bm_seq; /* sequence number for bitmap flushes */
214 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
215 enum check_states check_state; 211 enum check_states check_state;
@@ -240,19 +236,20 @@ struct stripe_head {
240}; 236};
241 237
242/* stripe_head_state - collects and tracks the dynamic state of a stripe_head 238/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
243 * for handle_stripe. It is only valid under spin_lock(sh->lock); 239 * for handle_stripe.
244 */ 240 */
245struct stripe_head_state { 241struct stripe_head_state {
246 int syncing, expanding, expanded; 242 int syncing, expanding, expanded;
247 int locked, uptodate, to_read, to_write, failed, written; 243 int locked, uptodate, to_read, to_write, failed, written;
248 int to_fill, compute, req_compute, non_overwrite; 244 int to_fill, compute, req_compute, non_overwrite;
249 int failed_num; 245 int failed_num[2];
246 int p_failed, q_failed;
247 int dec_preread_active;
250 unsigned long ops_request; 248 unsigned long ops_request;
251};
252 249
253/* r6_state - extra state data only relevant to r6 */ 250 struct bio *return_bi;
254struct r6_state { 251 mdk_rdev_t *blocked_rdev;
255 int p_failed, q_failed, failed_num[2]; 252 int handle_bad_blocks;
256}; 253};
257 254
258/* Flags */ 255/* Flags */
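With r6_state gone, stripe_head_state is the single per-call scratch area: failed_num[] grows to two entries, p_failed/q_failed move in, and the bios to complete, the blocked rdev and the bad-block flag that used to live in local variables of handle_stripe5/6 now travel inside it. A small sketch of how analyse-style code initializes and fills such a struct, with the field set trimmed to what this hunk shows and a plain device_ok[] array standing in for the real per-device checks:

#include <string.h>
#include <stdio.h>

struct stripe_state_model {
	int syncing, expanding, expanded;
	int locked, uptodate, to_read, to_write, failed, written;
	int failed_num[2];
	int p_failed, q_failed;
	int handle_bad_blocks;
};

static void analyse(struct stripe_state_model *s,
		    const int device_ok[], int disks)
{
	memset(s, 0, sizeof(*s));
	s->failed_num[0] = -1;              /* "no failed device yet"     */
	s->failed_num[1] = -1;

	for (int i = disks; i--; )
		if (!device_ok[i]) {
			if (s->failed < 2)  /* remember at most two       */
				s->failed_num[s->failed] = i;
			s->failed++;        /* but keep the full count    */
		}
}

int main(void)
{
	struct stripe_state_model s;
	int ok[6] = { 1, 0, 1, 1, 0, 1 };

	analyse(&s, ok, 6);
	printf("failed=%d first=%d second=%d\n",
	       s.failed, s.failed_num[0], s.failed_num[1]);
	return 0;
}
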
@@ -268,14 +265,16 @@ struct r6_state {
268#define R5_ReWrite 9 /* have tried to over-write the readerror */ 265#define R5_ReWrite 9 /* have tried to over-write the readerror */
269 266
270#define R5_Expanded 10 /* This block now has post-expand data */ 267#define R5_Expanded 10 /* This block now has post-expand data */
271#define R5_Wantcompute 11 /* compute_block in progress treat as 268#define R5_Wantcompute 11 /* compute_block in progress treat as
272 * uptodate 269 * uptodate
273 */ 270 */
274#define R5_Wantfill 12 /* dev->toread contains a bio that needs 271#define R5_Wantfill 12 /* dev->toread contains a bio that needs
275 * filling 272 * filling
276 */ 273 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 274#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */ 275#define R5_WantFUA 14 /* Write should be FUA */
276#define R5_WriteError 15 /* got a write error - need to record it */
277#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
279/* 278/*
280 * Write method 279 * Write method
281 */ 280 */
@@ -289,21 +288,25 @@ struct r6_state {
289/* 288/*
290 * Stripe state 289 * Stripe state
291 */ 290 */
292#define STRIPE_HANDLE 2 291enum {
293#define STRIPE_SYNCING 3 292 STRIPE_ACTIVE,
294#define STRIPE_INSYNC 4 293 STRIPE_HANDLE,
295#define STRIPE_PREREAD_ACTIVE 5 294 STRIPE_SYNC_REQUESTED,
296#define STRIPE_DELAYED 6 295 STRIPE_SYNCING,
297#define STRIPE_DEGRADED 7 296 STRIPE_INSYNC,
298#define STRIPE_BIT_DELAY 8 297 STRIPE_PREREAD_ACTIVE,
299#define STRIPE_EXPANDING 9 298 STRIPE_DELAYED,
300#define STRIPE_EXPAND_SOURCE 10 299 STRIPE_DEGRADED,
301#define STRIPE_EXPAND_READY 11 300 STRIPE_BIT_DELAY,
302#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 301 STRIPE_EXPANDING,
303#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 302 STRIPE_EXPAND_SOURCE,
304#define STRIPE_BIOFILL_RUN 14 303 STRIPE_EXPAND_READY,
305#define STRIPE_COMPUTE_RUN 15 304 STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
306#define STRIPE_OPS_REQ_PENDING 16 305 STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
306 STRIPE_BIOFILL_RUN,
307 STRIPE_COMPUTE_RUN,
308 STRIPE_OPS_REQ_PENDING,
309};
307 310
308/* 311/*
309 * Operation request flags 312 * Operation request flags
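The stripe state bits switch from #define constants to an anonymous enum and gain STRIPE_ACTIVE and STRIPE_SYNC_REQUESTED; they remain plain bit indices used with set_bit()/test_bit() on sh->state. A user-space sketch of the same idiom with generic helpers in place of the kernel bitops, reproducing the SYNC_REQUESTED handling shown earlier in handle_stripe():

#include <stdio.h>

enum {                       /* bit numbers, not masks */
	ST_ACTIVE,
	ST_HANDLE,
	ST_SYNC_REQUESTED,
	ST_SYNCING,
	ST_INSYNC,
};

static void set_state(unsigned long *state, int bit)   { *state |=  1UL << bit; }
static void clear_state(unsigned long *state, int bit) { *state &= ~(1UL << bit); }
static int  test_state(unsigned long state, int bit)   { return !!(state & (1UL << bit)); }

int main(void)
{
	unsigned long state = 0;

	set_state(&state, ST_SYNC_REQUESTED);
	/* handle_stripe() turns the request into the SYNCING phase: */
	if (test_state(state, ST_SYNC_REQUESTED)) {
		clear_state(&state, ST_SYNC_REQUESTED);
		set_state(&state, ST_SYNCING);
		clear_state(&state, ST_INSYNC);
	}
	printf("state=%#lx\n", state);
	return 0;
}
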
@@ -336,7 +339,7 @@ struct r6_state {
336 * PREREAD_ACTIVE. 339 * PREREAD_ACTIVE.
337 * In stripe_handle, if we find pre-reading is necessary, we do it if 340 * In stripe_handle, if we find pre-reading is necessary, we do it if
338 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. 341 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
339 * HANDLE gets cleared if stripe_handle leave nothing locked. 342 * HANDLE gets cleared if stripe_handle leaves nothing locked.
340 */ 343 */
341 344
342 345
@@ -399,7 +402,7 @@ struct raid5_private_data {
399 * (fresh device added). 402 * (fresh device added).
400 * Cleared when a sync completes. 403 * Cleared when a sync completes.
401 */ 404 */
402 405 int recovery_disabled;
403 /* per cpu variables */ 406 /* per cpu variables */
404 struct raid5_percpu { 407 struct raid5_percpu {
405 struct page *spare_page; /* Used when checking P/Q in raid6 */ 408 struct page *spare_page; /* Used when checking P/Q in raid6 */