author     Linus Torvalds <torvalds@woody.linux-foundation.org>   2008-02-07 22:30:50 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2008-02-07 22:30:50 -0500
commit     a4ffc0a0b240a29cbe489f6db9dae112a49ef1c1 (patch)
tree       9719c706444f4b720aff2bb4bdf23a4be3f4b1e3 /drivers
parent     d7511ec8115487ccea2ce93bf58d5e5cd2c1c0a3 (diff)
parent     af195ac82e38ba802fd86b5a014ed05ef6dd88bb (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (44 commits)
dm raid1: report fault status
dm raid1: handle read failures
dm raid1: fix EIO after log failure
dm raid1: handle recovery failures
dm raid1: handle write failures
dm snapshot: combine consecutive exceptions in memory
dm: stripe enhanced status return
dm: stripe trigger event on failure
dm log: auto load modules
dm: move deferred bio flushing to workqueue
dm crypt: use async crypto
dm crypt: prepare async callback fn
dm crypt: add completion for async
dm crypt: add async request mempool
dm crypt: extract scatterlist processing
dm crypt: tidy io ref counting
dm crypt: introduce crypt_write_io_loop
dm crypt: abstract crypt_write_done
dm crypt: store sector mapping in dm_crypt_io
dm crypt: move queue functions
...
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig              |  24
-rw-r--r--  drivers/md/dm-crypt.c           | 486
-rw-r--r--  drivers/md/dm-exception-store.c |   2
-rw-r--r--  drivers/md/dm-ioctl.c           |  32
-rw-r--r--  drivers/md/dm-log.c             |  51
-rw-r--r--  drivers/md/dm-mpath.c           |   2
-rw-r--r--  drivers/md/dm-raid1.c           | 664
-rw-r--r--  drivers/md/dm-snap.c            |  95
-rw-r--r--  drivers/md/dm-snap.h            |  50
-rw-r--r--  drivers/md/dm-stripe.c          | 105
-rw-r--r--  drivers/md/dm-table.c           |  20
-rw-r--r--  drivers/md/dm.c                 | 238
12 files changed, 1361 insertions, 408 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3fa7c77d9bd9..610af916891e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -204,7 +204,7 @@ config BLK_DEV_DM | |||
204 | 204 | ||
205 | config DM_DEBUG | 205 | config DM_DEBUG |
206 | boolean "Device mapper debugging support" | 206 | boolean "Device mapper debugging support" |
207 | depends on BLK_DEV_DM && EXPERIMENTAL | 207 | depends on BLK_DEV_DM |
208 | ---help--- | 208 | ---help--- |
209 | Enable this for messages that may help debug device-mapper problems. | 209 | Enable this for messages that may help debug device-mapper problems. |
210 | 210 | ||
@@ -212,7 +212,7 @@ config DM_DEBUG | |||
212 | 212 | ||
213 | config DM_CRYPT | 213 | config DM_CRYPT |
214 | tristate "Crypt target support" | 214 | tristate "Crypt target support" |
215 | depends on BLK_DEV_DM && EXPERIMENTAL | 215 | depends on BLK_DEV_DM |
216 | select CRYPTO | 216 | select CRYPTO |
217 | select CRYPTO_CBC | 217 | select CRYPTO_CBC |
218 | ---help--- | 218 | ---help--- |
@@ -230,34 +230,34 @@ config DM_CRYPT | |||
230 | If unsure, say N. | 230 | If unsure, say N. |
231 | 231 | ||
232 | config DM_SNAPSHOT | 232 | config DM_SNAPSHOT |
233 | tristate "Snapshot target (EXPERIMENTAL)" | 233 | tristate "Snapshot target" |
234 | depends on BLK_DEV_DM && EXPERIMENTAL | 234 | depends on BLK_DEV_DM |
235 | ---help--- | 235 | ---help--- |
236 | Allow volume managers to take writable snapshots of a device. | 236 | Allow volume managers to take writable snapshots of a device. |
237 | 237 | ||
238 | config DM_MIRROR | 238 | config DM_MIRROR |
239 | tristate "Mirror target (EXPERIMENTAL)" | 239 | tristate "Mirror target" |
240 | depends on BLK_DEV_DM && EXPERIMENTAL | 240 | depends on BLK_DEV_DM |
241 | ---help--- | 241 | ---help--- |
242 | Allow volume managers to mirror logical volumes, also | 242 | Allow volume managers to mirror logical volumes, also |
243 | needed for live data migration tools such as 'pvmove'. | 243 | needed for live data migration tools such as 'pvmove'. |
244 | 244 | ||
245 | config DM_ZERO | 245 | config DM_ZERO |
246 | tristate "Zero target (EXPERIMENTAL)" | 246 | tristate "Zero target" |
247 | depends on BLK_DEV_DM && EXPERIMENTAL | 247 | depends on BLK_DEV_DM |
248 | ---help--- | 248 | ---help--- |
249 | A target that discards writes, and returns all zeroes for | 249 | A target that discards writes, and returns all zeroes for |
250 | reads. Useful in some recovery situations. | 250 | reads. Useful in some recovery situations. |
251 | 251 | ||
252 | config DM_MULTIPATH | 252 | config DM_MULTIPATH |
253 | tristate "Multipath target (EXPERIMENTAL)" | 253 | tristate "Multipath target" |
254 | depends on BLK_DEV_DM && EXPERIMENTAL | 254 | depends on BLK_DEV_DM |
255 | ---help--- | 255 | ---help--- |
256 | Allow volume managers to support multipath hardware. | 256 | Allow volume managers to support multipath hardware. |
257 | 257 | ||
258 | config DM_MULTIPATH_EMC | 258 | config DM_MULTIPATH_EMC |
259 | tristate "EMC CX/AX multipath support (EXPERIMENTAL)" | 259 | tristate "EMC CX/AX multipath support" |
260 | depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL | 260 | depends on DM_MULTIPATH && BLK_DEV_DM |
261 | ---help--- | 261 | ---help--- |
262 | Multipath support for EMC CX/AX series hardware. | 262 | Multipath support for EMC CX/AX series hardware. |
263 | 263 | ||
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6b66ee46b87d..b04f98df94ea 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,11 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> |
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
4 | * Copyright (C) 2006 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. |
5 | * | 5 | * |
6 | * This file is released under the GPL. | 6 | * This file is released under the GPL. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/completion.h> | ||
9 | #include <linux/err.h> | 10 | #include <linux/err.h> |
10 | #include <linux/module.h> | 11 | #include <linux/module.h> |
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
@@ -28,20 +29,10 @@ | |||
28 | #define MESG_STR(x) x, sizeof(x) | 29 | #define MESG_STR(x) x, sizeof(x) |
29 | 30 | ||
30 | /* | 31 | /* |
31 | * per bio private data | ||
32 | */ | ||
33 | struct dm_crypt_io { | ||
34 | struct dm_target *target; | ||
35 | struct bio *base_bio; | ||
36 | struct work_struct work; | ||
37 | atomic_t pending; | ||
38 | int error; | ||
39 | }; | ||
40 | |||
41 | /* | ||
42 | * context holding the current state of a multi-part conversion | 32 | * context holding the current state of a multi-part conversion |
43 | */ | 33 | */ |
44 | struct convert_context { | 34 | struct convert_context { |
35 | struct completion restart; | ||
45 | struct bio *bio_in; | 36 | struct bio *bio_in; |
46 | struct bio *bio_out; | 37 | struct bio *bio_out; |
47 | unsigned int offset_in; | 38 | unsigned int offset_in; |
@@ -49,7 +40,27 @@ struct convert_context { | |||
49 | unsigned int idx_in; | 40 | unsigned int idx_in; |
50 | unsigned int idx_out; | 41 | unsigned int idx_out; |
51 | sector_t sector; | 42 | sector_t sector; |
52 | int write; | 43 | atomic_t pending; |
44 | }; | ||
45 | |||
46 | /* | ||
47 | * per bio private data | ||
48 | */ | ||
49 | struct dm_crypt_io { | ||
50 | struct dm_target *target; | ||
51 | struct bio *base_bio; | ||
52 | struct work_struct work; | ||
53 | |||
54 | struct convert_context ctx; | ||
55 | |||
56 | atomic_t pending; | ||
57 | int error; | ||
58 | sector_t sector; | ||
59 | }; | ||
60 | |||
61 | struct dm_crypt_request { | ||
62 | struct scatterlist sg_in; | ||
63 | struct scatterlist sg_out; | ||
53 | }; | 64 | }; |
54 | 65 | ||
55 | struct crypt_config; | 66 | struct crypt_config; |
@@ -72,10 +83,11 @@ struct crypt_config { | |||
72 | sector_t start; | 83 | sector_t start; |
73 | 84 | ||
74 | /* | 85 | /* |
75 | * pool for per bio private data and | 86 | * pool for per bio private data, crypto requests and |
76 | * for encryption buffer pages | 87 | * encryption requeusts/buffer pages |
77 | */ | 88 | */ |
78 | mempool_t *io_pool; | 89 | mempool_t *io_pool; |
90 | mempool_t *req_pool; | ||
79 | mempool_t *page_pool; | 91 | mempool_t *page_pool; |
80 | struct bio_set *bs; | 92 | struct bio_set *bs; |
81 | 93 | ||
@@ -93,9 +105,25 @@ struct crypt_config { | |||
93 | sector_t iv_offset; | 105 | sector_t iv_offset; |
94 | unsigned int iv_size; | 106 | unsigned int iv_size; |
95 | 107 | ||
108 | /* | ||
109 | * Layout of each crypto request: | ||
110 | * | ||
111 | * struct ablkcipher_request | ||
112 | * context | ||
113 | * padding | ||
114 | * struct dm_crypt_request | ||
115 | * padding | ||
116 | * IV | ||
117 | * | ||
118 | * The padding is added so that dm_crypt_request and the IV are | ||
119 | * correctly aligned. | ||
120 | */ | ||
121 | unsigned int dmreq_start; | ||
122 | struct ablkcipher_request *req; | ||
123 | |||
96 | char cipher[CRYPTO_MAX_ALG_NAME]; | 124 | char cipher[CRYPTO_MAX_ALG_NAME]; |
97 | char chainmode[CRYPTO_MAX_ALG_NAME]; | 125 | char chainmode[CRYPTO_MAX_ALG_NAME]; |
98 | struct crypto_blkcipher *tfm; | 126 | struct crypto_ablkcipher *tfm; |
99 | unsigned long flags; | 127 | unsigned long flags; |
100 | unsigned int key_size; | 128 | unsigned int key_size; |
101 | u8 key[0]; | 129 | u8 key[0]; |
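
Editorial note: the layout comment added above describes how dm-crypt now carves a single mempool element into the ablkcipher request, its transform context, the dm_crypt_request bookkeeping and the IV. The userspace sketch below models that offset arithmetic with made-up sizes and alignments; the real values come from crypto_ablkcipher_reqsize(), crypto_tfm_ctx_alignment() and the cipher's alignmask, and crypt_ctr() reserves worst-case alignment slack rather than aligning exactly, so treat this as an illustration only.

```c
/*
 * Userspace model of the per-request layout described in the comment
 * above: one allocation holding the ablkcipher_request, the transform
 * context, a struct dm_crypt_request and the IV, with padding so the
 * latter two are suitably aligned.  All sizes/alignments here are
 * made-up placeholders, not values from the real crypto API.
 */
#include <stdio.h>
#include <stddef.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t req_size    = 64;  /* sizeof(struct ablkcipher_request), assumed */
	size_t ctx_size    = 24;  /* crypto_ablkcipher_reqsize(tfm), assumed    */
	size_t ctx_align   = 16;  /* crypto_tfm_ctx_alignment(), assumed        */
	size_t iv_align    = 8;   /* cipher alignmask + 1, assumed              */
	size_t dmreq_size  = 32;  /* sizeof(struct dm_crypt_request), assumed   */
	size_t iv_size     = 16;

	/* Offset of struct dm_crypt_request inside the allocation. */
	size_t dmreq_start = ALIGN_UP(req_size + ctx_size, ctx_align);

	/* The IV follows dm_crypt_request, aligned for the cipher. */
	size_t iv_offset   = ALIGN_UP(dmreq_start + dmreq_size, iv_align);

	printf("dm_crypt_request at %zu, IV at %zu, element size %zu\n",
	       dmreq_start, iv_offset, iv_offset + iv_size);
	return 0;
}
```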
@@ -108,6 +136,7 @@ struct crypt_config { | |||
108 | static struct kmem_cache *_crypt_io_pool; | 136 | static struct kmem_cache *_crypt_io_pool; |
109 | 137 | ||
110 | static void clone_init(struct dm_crypt_io *, struct bio *); | 138 | static void clone_init(struct dm_crypt_io *, struct bio *); |
139 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); | ||
111 | 140 | ||
112 | /* | 141 | /* |
113 | * Different IV generation algorithms: | 142 | * Different IV generation algorithms: |
@@ -188,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
188 | return PTR_ERR(essiv_tfm); | 217 | return PTR_ERR(essiv_tfm); |
189 | } | 218 | } |
190 | if (crypto_cipher_blocksize(essiv_tfm) != | 219 | if (crypto_cipher_blocksize(essiv_tfm) != |
191 | crypto_blkcipher_ivsize(cc->tfm)) { | 220 | crypto_ablkcipher_ivsize(cc->tfm)) { |
192 | ti->error = "Block size of ESSIV cipher does " | 221 | ti->error = "Block size of ESSIV cipher does " |
193 | "not match IV size of block cipher"; | 222 | "not match IV size of block cipher"; |
194 | crypto_free_cipher(essiv_tfm); | 223 | crypto_free_cipher(essiv_tfm); |
@@ -225,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | |||
225 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | 254 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, |
226 | const char *opts) | 255 | const char *opts) |
227 | { | 256 | { |
228 | unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); | 257 | unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); |
229 | int log = ilog2(bs); | 258 | int log = ilog2(bs); |
230 | 259 | ||
231 | /* we need to calculate how far we must shift the sector count | 260 | /* we need to calculate how far we must shift the sector count |
@@ -289,42 +318,10 @@ static struct crypt_iv_operations crypt_iv_null_ops = { | |||
289 | .generator = crypt_iv_null_gen | 318 | .generator = crypt_iv_null_gen |
290 | }; | 319 | }; |
291 | 320 | ||
292 | static int | ||
293 | crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, | ||
294 | struct scatterlist *in, unsigned int length, | ||
295 | int write, sector_t sector) | ||
296 | { | ||
297 | u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64)))); | ||
298 | struct blkcipher_desc desc = { | ||
299 | .tfm = cc->tfm, | ||
300 | .info = iv, | ||
301 | .flags = CRYPTO_TFM_REQ_MAY_SLEEP, | ||
302 | }; | ||
303 | int r; | ||
304 | |||
305 | if (cc->iv_gen_ops) { | ||
306 | r = cc->iv_gen_ops->generator(cc, iv, sector); | ||
307 | if (r < 0) | ||
308 | return r; | ||
309 | |||
310 | if (write) | ||
311 | r = crypto_blkcipher_encrypt_iv(&desc, out, in, length); | ||
312 | else | ||
313 | r = crypto_blkcipher_decrypt_iv(&desc, out, in, length); | ||
314 | } else { | ||
315 | if (write) | ||
316 | r = crypto_blkcipher_encrypt(&desc, out, in, length); | ||
317 | else | ||
318 | r = crypto_blkcipher_decrypt(&desc, out, in, length); | ||
319 | } | ||
320 | |||
321 | return r; | ||
322 | } | ||
323 | |||
324 | static void crypt_convert_init(struct crypt_config *cc, | 321 | static void crypt_convert_init(struct crypt_config *cc, |
325 | struct convert_context *ctx, | 322 | struct convert_context *ctx, |
326 | struct bio *bio_out, struct bio *bio_in, | 323 | struct bio *bio_out, struct bio *bio_in, |
327 | sector_t sector, int write) | 324 | sector_t sector) |
328 | { | 325 | { |
329 | ctx->bio_in = bio_in; | 326 | ctx->bio_in = bio_in; |
330 | ctx->bio_out = bio_out; | 327 | ctx->bio_out = bio_out; |
@@ -333,7 +330,79 @@ static void crypt_convert_init(struct crypt_config *cc, | |||
333 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; | 330 | ctx->idx_in = bio_in ? bio_in->bi_idx : 0; |
334 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; | 331 | ctx->idx_out = bio_out ? bio_out->bi_idx : 0; |
335 | ctx->sector = sector + cc->iv_offset; | 332 | ctx->sector = sector + cc->iv_offset; |
336 | ctx->write = write; | 333 | init_completion(&ctx->restart); |
334 | /* | ||
335 | * Crypto operation can be asynchronous, | ||
336 | * ctx->pending is increased after request submission. | ||
337 | * We need to ensure that we don't call the crypt finish | ||
338 | * operation before pending got incremented | ||
339 | * (dependent on crypt submission return code). | ||
340 | */ | ||
341 | atomic_set(&ctx->pending, 2); | ||
342 | } | ||
343 | |||
344 | static int crypt_convert_block(struct crypt_config *cc, | ||
345 | struct convert_context *ctx, | ||
346 | struct ablkcipher_request *req) | ||
347 | { | ||
348 | struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); | ||
349 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | ||
350 | struct dm_crypt_request *dmreq; | ||
351 | u8 *iv; | ||
352 | int r = 0; | ||
353 | |||
354 | dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start); | ||
355 | iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), | ||
356 | crypto_ablkcipher_alignmask(cc->tfm) + 1); | ||
357 | |||
358 | sg_init_table(&dmreq->sg_in, 1); | ||
359 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | ||
360 | bv_in->bv_offset + ctx->offset_in); | ||
361 | |||
362 | sg_init_table(&dmreq->sg_out, 1); | ||
363 | sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, | ||
364 | bv_out->bv_offset + ctx->offset_out); | ||
365 | |||
366 | ctx->offset_in += 1 << SECTOR_SHIFT; | ||
367 | if (ctx->offset_in >= bv_in->bv_len) { | ||
368 | ctx->offset_in = 0; | ||
369 | ctx->idx_in++; | ||
370 | } | ||
371 | |||
372 | ctx->offset_out += 1 << SECTOR_SHIFT; | ||
373 | if (ctx->offset_out >= bv_out->bv_len) { | ||
374 | ctx->offset_out = 0; | ||
375 | ctx->idx_out++; | ||
376 | } | ||
377 | |||
378 | if (cc->iv_gen_ops) { | ||
379 | r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); | ||
380 | if (r < 0) | ||
381 | return r; | ||
382 | } | ||
383 | |||
384 | ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, | ||
385 | 1 << SECTOR_SHIFT, iv); | ||
386 | |||
387 | if (bio_data_dir(ctx->bio_in) == WRITE) | ||
388 | r = crypto_ablkcipher_encrypt(req); | ||
389 | else | ||
390 | r = crypto_ablkcipher_decrypt(req); | ||
391 | |||
392 | return r; | ||
393 | } | ||
394 | |||
395 | static void kcryptd_async_done(struct crypto_async_request *async_req, | ||
396 | int error); | ||
397 | static void crypt_alloc_req(struct crypt_config *cc, | ||
398 | struct convert_context *ctx) | ||
399 | { | ||
400 | if (!cc->req) | ||
401 | cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); | ||
402 | ablkcipher_request_set_tfm(cc->req, cc->tfm); | ||
403 | ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | | ||
404 | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
405 | kcryptd_async_done, ctx); | ||
337 | } | 406 | } |
338 | 407 | ||
339 | /* | 408 | /* |
@@ -346,36 +415,38 @@ static int crypt_convert(struct crypt_config *cc, | |||
346 | 415 | ||
347 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && | 416 | while(ctx->idx_in < ctx->bio_in->bi_vcnt && |
348 | ctx->idx_out < ctx->bio_out->bi_vcnt) { | 417 | ctx->idx_out < ctx->bio_out->bi_vcnt) { |
349 | struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); | ||
350 | struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); | ||
351 | struct scatterlist sg_in, sg_out; | ||
352 | |||
353 | sg_init_table(&sg_in, 1); | ||
354 | sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in); | ||
355 | |||
356 | sg_init_table(&sg_out, 1); | ||
357 | sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out); | ||
358 | 418 | ||
359 | ctx->offset_in += sg_in.length; | 419 | crypt_alloc_req(cc, ctx); |
360 | if (ctx->offset_in >= bv_in->bv_len) { | 420 | |
361 | ctx->offset_in = 0; | 421 | r = crypt_convert_block(cc, ctx, cc->req); |
362 | ctx->idx_in++; | 422 | |
423 | switch (r) { | ||
424 | case -EBUSY: | ||
425 | wait_for_completion(&ctx->restart); | ||
426 | INIT_COMPLETION(ctx->restart); | ||
427 | /* fall through*/ | ||
428 | case -EINPROGRESS: | ||
429 | atomic_inc(&ctx->pending); | ||
430 | cc->req = NULL; | ||
431 | r = 0; | ||
432 | /* fall through*/ | ||
433 | case 0: | ||
434 | ctx->sector++; | ||
435 | continue; | ||
363 | } | 436 | } |
364 | 437 | ||
365 | ctx->offset_out += sg_out.length; | 438 | break; |
366 | if (ctx->offset_out >= bv_out->bv_len) { | ||
367 | ctx->offset_out = 0; | ||
368 | ctx->idx_out++; | ||
369 | } | ||
370 | |||
371 | r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length, | ||
372 | ctx->write, ctx->sector); | ||
373 | if (r < 0) | ||
374 | break; | ||
375 | |||
376 | ctx->sector++; | ||
377 | } | 439 | } |
378 | 440 | ||
441 | /* | ||
442 | * If there are pending crypto operation run async | ||
443 | * code. Otherwise process return code synchronously. | ||
444 | * The step of 2 ensures that async finish doesn't | ||
445 | * call crypto finish too early. | ||
446 | */ | ||
447 | if (atomic_sub_return(2, &ctx->pending)) | ||
448 | return -EINPROGRESS; | ||
449 | |||
379 | return r; | 450 | return r; |
380 | } | 451 | } |
381 | 452 | ||
@@ -455,18 +526,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) | |||
455 | * One of the bios was finished. Check for completion of | 526 | * One of the bios was finished. Check for completion of |
456 | * the whole request and correctly clean up the buffer. | 527 | * the whole request and correctly clean up the buffer. |
457 | */ | 528 | */ |
458 | static void crypt_dec_pending(struct dm_crypt_io *io, int error) | 529 | static void crypt_dec_pending(struct dm_crypt_io *io) |
459 | { | 530 | { |
460 | struct crypt_config *cc = (struct crypt_config *) io->target->private; | 531 | struct crypt_config *cc = io->target->private; |
461 | |||
462 | if (error < 0) | ||
463 | io->error = error; | ||
464 | 532 | ||
465 | if (!atomic_dec_and_test(&io->pending)) | 533 | if (!atomic_dec_and_test(&io->pending)) |
466 | return; | 534 | return; |
467 | 535 | ||
468 | bio_endio(io->base_bio, io->error); | 536 | bio_endio(io->base_bio, io->error); |
469 | |||
470 | mempool_free(io, cc->io_pool); | 537 | mempool_free(io, cc->io_pool); |
471 | } | 538 | } |
472 | 539 | ||
@@ -484,30 +551,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io, int error) | |||
484 | * starved by new requests which can block in the first stages due | 551 | * starved by new requests which can block in the first stages due |
485 | * to memory allocation. | 552 | * to memory allocation. |
486 | */ | 553 | */ |
487 | static void kcryptd_do_work(struct work_struct *work); | ||
488 | static void kcryptd_do_crypt(struct work_struct *work); | ||
489 | |||
490 | static void kcryptd_queue_io(struct dm_crypt_io *io) | ||
491 | { | ||
492 | struct crypt_config *cc = io->target->private; | ||
493 | |||
494 | INIT_WORK(&io->work, kcryptd_do_work); | ||
495 | queue_work(cc->io_queue, &io->work); | ||
496 | } | ||
497 | |||
498 | static void kcryptd_queue_crypt(struct dm_crypt_io *io) | ||
499 | { | ||
500 | struct crypt_config *cc = io->target->private; | ||
501 | |||
502 | INIT_WORK(&io->work, kcryptd_do_crypt); | ||
503 | queue_work(cc->crypt_queue, &io->work); | ||
504 | } | ||
505 | |||
506 | static void crypt_endio(struct bio *clone, int error) | 554 | static void crypt_endio(struct bio *clone, int error) |
507 | { | 555 | { |
508 | struct dm_crypt_io *io = clone->bi_private; | 556 | struct dm_crypt_io *io = clone->bi_private; |
509 | struct crypt_config *cc = io->target->private; | 557 | struct crypt_config *cc = io->target->private; |
510 | unsigned read_io = bio_data_dir(clone) == READ; | 558 | unsigned rw = bio_data_dir(clone); |
511 | 559 | ||
512 | if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) | 560 | if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) |
513 | error = -EIO; | 561 | error = -EIO; |
@@ -515,21 +563,20 @@ static void crypt_endio(struct bio *clone, int error) | |||
515 | /* | 563 | /* |
516 | * free the processed pages | 564 | * free the processed pages |
517 | */ | 565 | */ |
518 | if (!read_io) { | 566 | if (rw == WRITE) |
519 | crypt_free_buffer_pages(cc, clone); | 567 | crypt_free_buffer_pages(cc, clone); |
520 | goto out; | 568 | |
569 | bio_put(clone); | ||
570 | |||
571 | if (rw == READ && !error) { | ||
572 | kcryptd_queue_crypt(io); | ||
573 | return; | ||
521 | } | 574 | } |
522 | 575 | ||
523 | if (unlikely(error)) | 576 | if (unlikely(error)) |
524 | goto out; | 577 | io->error = error; |
525 | |||
526 | bio_put(clone); | ||
527 | kcryptd_queue_crypt(io); | ||
528 | return; | ||
529 | 578 | ||
530 | out: | 579 | crypt_dec_pending(io); |
531 | bio_put(clone); | ||
532 | crypt_dec_pending(io, error); | ||
533 | } | 580 | } |
534 | 581 | ||
535 | static void clone_init(struct dm_crypt_io *io, struct bio *clone) | 582 | static void clone_init(struct dm_crypt_io *io, struct bio *clone) |
@@ -543,12 +590,11 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) | |||
543 | clone->bi_destructor = dm_crypt_bio_destructor; | 590 | clone->bi_destructor = dm_crypt_bio_destructor; |
544 | } | 591 | } |
545 | 592 | ||
546 | static void process_read(struct dm_crypt_io *io) | 593 | static void kcryptd_io_read(struct dm_crypt_io *io) |
547 | { | 594 | { |
548 | struct crypt_config *cc = io->target->private; | 595 | struct crypt_config *cc = io->target->private; |
549 | struct bio *base_bio = io->base_bio; | 596 | struct bio *base_bio = io->base_bio; |
550 | struct bio *clone; | 597 | struct bio *clone; |
551 | sector_t sector = base_bio->bi_sector - io->target->begin; | ||
552 | 598 | ||
553 | atomic_inc(&io->pending); | 599 | atomic_inc(&io->pending); |
554 | 600 | ||
@@ -559,7 +605,8 @@ static void process_read(struct dm_crypt_io *io) | |||
559 | */ | 605 | */ |
560 | clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); | 606 | clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); |
561 | if (unlikely(!clone)) { | 607 | if (unlikely(!clone)) { |
562 | crypt_dec_pending(io, -ENOMEM); | 608 | io->error = -ENOMEM; |
609 | crypt_dec_pending(io); | ||
563 | return; | 610 | return; |
564 | } | 611 | } |
565 | 612 | ||
@@ -567,25 +614,71 @@ static void process_read(struct dm_crypt_io *io) | |||
567 | clone->bi_idx = 0; | 614 | clone->bi_idx = 0; |
568 | clone->bi_vcnt = bio_segments(base_bio); | 615 | clone->bi_vcnt = bio_segments(base_bio); |
569 | clone->bi_size = base_bio->bi_size; | 616 | clone->bi_size = base_bio->bi_size; |
570 | clone->bi_sector = cc->start + sector; | 617 | clone->bi_sector = cc->start + io->sector; |
571 | memcpy(clone->bi_io_vec, bio_iovec(base_bio), | 618 | memcpy(clone->bi_io_vec, bio_iovec(base_bio), |
572 | sizeof(struct bio_vec) * clone->bi_vcnt); | 619 | sizeof(struct bio_vec) * clone->bi_vcnt); |
573 | 620 | ||
574 | generic_make_request(clone); | 621 | generic_make_request(clone); |
575 | } | 622 | } |
576 | 623 | ||
577 | static void process_write(struct dm_crypt_io *io) | 624 | static void kcryptd_io_write(struct dm_crypt_io *io) |
625 | { | ||
626 | struct bio *clone = io->ctx.bio_out; | ||
627 | |||
628 | generic_make_request(clone); | ||
629 | } | ||
630 | |||
631 | static void kcryptd_io(struct work_struct *work) | ||
632 | { | ||
633 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | ||
634 | |||
635 | if (bio_data_dir(io->base_bio) == READ) | ||
636 | kcryptd_io_read(io); | ||
637 | else | ||
638 | kcryptd_io_write(io); | ||
639 | } | ||
640 | |||
641 | static void kcryptd_queue_io(struct dm_crypt_io *io) | ||
578 | { | 642 | { |
579 | struct crypt_config *cc = io->target->private; | 643 | struct crypt_config *cc = io->target->private; |
580 | struct bio *base_bio = io->base_bio; | ||
581 | struct bio *clone; | ||
582 | struct convert_context ctx; | ||
583 | unsigned remaining = base_bio->bi_size; | ||
584 | sector_t sector = base_bio->bi_sector - io->target->begin; | ||
585 | 644 | ||
586 | atomic_inc(&io->pending); | 645 | INIT_WORK(&io->work, kcryptd_io); |
646 | queue_work(cc->io_queue, &io->work); | ||
647 | } | ||
587 | 648 | ||
588 | crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); | 649 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, |
650 | int error, int async) | ||
651 | { | ||
652 | struct bio *clone = io->ctx.bio_out; | ||
653 | struct crypt_config *cc = io->target->private; | ||
654 | |||
655 | if (unlikely(error < 0)) { | ||
656 | crypt_free_buffer_pages(cc, clone); | ||
657 | bio_put(clone); | ||
658 | io->error = -EIO; | ||
659 | return; | ||
660 | } | ||
661 | |||
662 | /* crypt_convert should have filled the clone bio */ | ||
663 | BUG_ON(io->ctx.idx_out < clone->bi_vcnt); | ||
664 | |||
665 | clone->bi_sector = cc->start + io->sector; | ||
666 | io->sector += bio_sectors(clone); | ||
667 | |||
668 | if (async) | ||
669 | kcryptd_queue_io(io); | ||
670 | else { | ||
671 | atomic_inc(&io->pending); | ||
672 | generic_make_request(clone); | ||
673 | } | ||
674 | } | ||
675 | |||
676 | static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) | ||
677 | { | ||
678 | struct crypt_config *cc = io->target->private; | ||
679 | struct bio *clone; | ||
680 | unsigned remaining = io->base_bio->bi_size; | ||
681 | int r; | ||
589 | 682 | ||
590 | /* | 683 | /* |
591 | * The allocated buffers can be smaller than the whole bio, | 684 | * The allocated buffers can be smaller than the whole bio, |
@@ -594,70 +687,110 @@ static void process_write(struct dm_crypt_io *io) | |||
594 | while (remaining) { | 687 | while (remaining) { |
595 | clone = crypt_alloc_buffer(io, remaining); | 688 | clone = crypt_alloc_buffer(io, remaining); |
596 | if (unlikely(!clone)) { | 689 | if (unlikely(!clone)) { |
597 | crypt_dec_pending(io, -ENOMEM); | 690 | io->error = -ENOMEM; |
598 | return; | 691 | return; |
599 | } | 692 | } |
600 | 693 | ||
601 | ctx.bio_out = clone; | 694 | io->ctx.bio_out = clone; |
602 | ctx.idx_out = 0; | 695 | io->ctx.idx_out = 0; |
603 | 696 | ||
604 | if (unlikely(crypt_convert(cc, &ctx) < 0)) { | ||
605 | crypt_free_buffer_pages(cc, clone); | ||
606 | bio_put(clone); | ||
607 | crypt_dec_pending(io, -EIO); | ||
608 | return; | ||
609 | } | ||
610 | |||
611 | /* crypt_convert should have filled the clone bio */ | ||
612 | BUG_ON(ctx.idx_out < clone->bi_vcnt); | ||
613 | |||
614 | clone->bi_sector = cc->start + sector; | ||
615 | remaining -= clone->bi_size; | 697 | remaining -= clone->bi_size; |
616 | sector += bio_sectors(clone); | ||
617 | 698 | ||
618 | /* Grab another reference to the io struct | 699 | r = crypt_convert(cc, &io->ctx); |
619 | * before we kick off the request */ | ||
620 | if (remaining) | ||
621 | atomic_inc(&io->pending); | ||
622 | 700 | ||
623 | generic_make_request(clone); | 701 | if (r != -EINPROGRESS) { |
624 | 702 | kcryptd_crypt_write_io_submit(io, r, 0); | |
625 | /* Do not reference clone after this - it | 703 | if (unlikely(r < 0)) |
626 | * may be gone already. */ | 704 | return; |
705 | } else | ||
706 | atomic_inc(&io->pending); | ||
627 | 707 | ||
628 | /* out of memory -> run queues */ | 708 | /* out of memory -> run queues */ |
629 | if (remaining) | 709 | if (unlikely(remaining)) |
630 | congestion_wait(WRITE, HZ/100); | 710 | congestion_wait(WRITE, HZ/100); |
631 | } | 711 | } |
632 | } | 712 | } |
633 | 713 | ||
634 | static void process_read_endio(struct dm_crypt_io *io) | 714 | static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) |
635 | { | 715 | { |
636 | struct crypt_config *cc = io->target->private; | 716 | struct crypt_config *cc = io->target->private; |
637 | struct convert_context ctx; | ||
638 | 717 | ||
639 | crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, | 718 | /* |
640 | io->base_bio->bi_sector - io->target->begin, 0); | 719 | * Prevent io from disappearing until this function completes. |
720 | */ | ||
721 | atomic_inc(&io->pending); | ||
722 | |||
723 | crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); | ||
724 | kcryptd_crypt_write_convert_loop(io); | ||
641 | 725 | ||
642 | crypt_dec_pending(io, crypt_convert(cc, &ctx)); | 726 | crypt_dec_pending(io); |
643 | } | 727 | } |
644 | 728 | ||
645 | static void kcryptd_do_work(struct work_struct *work) | 729 | static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) |
646 | { | 730 | { |
647 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | 731 | if (unlikely(error < 0)) |
732 | io->error = -EIO; | ||
733 | |||
734 | crypt_dec_pending(io); | ||
735 | } | ||
736 | |||
737 | static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | ||
738 | { | ||
739 | struct crypt_config *cc = io->target->private; | ||
740 | int r = 0; | ||
741 | |||
742 | atomic_inc(&io->pending); | ||
743 | |||
744 | crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, | ||
745 | io->sector); | ||
746 | |||
747 | r = crypt_convert(cc, &io->ctx); | ||
748 | |||
749 | if (r != -EINPROGRESS) | ||
750 | kcryptd_crypt_read_done(io, r); | ||
751 | |||
752 | crypt_dec_pending(io); | ||
753 | } | ||
754 | |||
755 | static void kcryptd_async_done(struct crypto_async_request *async_req, | ||
756 | int error) | ||
757 | { | ||
758 | struct convert_context *ctx = async_req->data; | ||
759 | struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); | ||
760 | struct crypt_config *cc = io->target->private; | ||
761 | |||
762 | if (error == -EINPROGRESS) { | ||
763 | complete(&ctx->restart); | ||
764 | return; | ||
765 | } | ||
766 | |||
767 | mempool_free(ablkcipher_request_cast(async_req), cc->req_pool); | ||
768 | |||
769 | if (!atomic_dec_and_test(&ctx->pending)) | ||
770 | return; | ||
648 | 771 | ||
649 | if (bio_data_dir(io->base_bio) == READ) | 772 | if (bio_data_dir(io->base_bio) == READ) |
650 | process_read(io); | 773 | kcryptd_crypt_read_done(io, error); |
774 | else | ||
775 | kcryptd_crypt_write_io_submit(io, error, 1); | ||
651 | } | 776 | } |
652 | 777 | ||
653 | static void kcryptd_do_crypt(struct work_struct *work) | 778 | static void kcryptd_crypt(struct work_struct *work) |
654 | { | 779 | { |
655 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | 780 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); |
656 | 781 | ||
657 | if (bio_data_dir(io->base_bio) == READ) | 782 | if (bio_data_dir(io->base_bio) == READ) |
658 | process_read_endio(io); | 783 | kcryptd_crypt_read_convert(io); |
659 | else | 784 | else |
660 | process_write(io); | 785 | kcryptd_crypt_write_convert(io); |
786 | } | ||
787 | |||
788 | static void kcryptd_queue_crypt(struct dm_crypt_io *io) | ||
789 | { | ||
790 | struct crypt_config *cc = io->target->private; | ||
791 | |||
792 | INIT_WORK(&io->work, kcryptd_crypt); | ||
793 | queue_work(cc->crypt_queue, &io->work); | ||
661 | } | 794 | } |
662 | 795 | ||
663 | /* | 796 | /* |
@@ -733,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc) | |||
733 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | 866 | static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
734 | { | 867 | { |
735 | struct crypt_config *cc; | 868 | struct crypt_config *cc; |
736 | struct crypto_blkcipher *tfm; | 869 | struct crypto_ablkcipher *tfm; |
737 | char *tmp; | 870 | char *tmp; |
738 | char *cipher; | 871 | char *cipher; |
739 | char *chainmode; | 872 | char *chainmode; |
@@ -787,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
787 | goto bad_cipher; | 920 | goto bad_cipher; |
788 | } | 921 | } |
789 | 922 | ||
790 | tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | 923 | tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); |
791 | if (IS_ERR(tfm)) { | 924 | if (IS_ERR(tfm)) { |
792 | ti->error = "Error allocating crypto tfm"; | 925 | ti->error = "Error allocating crypto tfm"; |
793 | goto bad_cipher; | 926 | goto bad_cipher; |
@@ -821,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
821 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | 954 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) |
822 | goto bad_ivmode; | 955 | goto bad_ivmode; |
823 | 956 | ||
824 | cc->iv_size = crypto_blkcipher_ivsize(tfm); | 957 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); |
825 | if (cc->iv_size) | 958 | if (cc->iv_size) |
826 | /* at least a 64 bit sector number should fit in our buffer */ | 959 | /* at least a 64 bit sector number should fit in our buffer */ |
827 | cc->iv_size = max(cc->iv_size, | 960 | cc->iv_size = max(cc->iv_size, |
@@ -841,6 +974,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
841 | goto bad_slab_pool; | 974 | goto bad_slab_pool; |
842 | } | 975 | } |
843 | 976 | ||
977 | cc->dmreq_start = sizeof(struct ablkcipher_request); | ||
978 | cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); | ||
979 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); | ||
980 | cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) & | ||
981 | ~(crypto_tfm_ctx_alignment() - 1); | ||
982 | |||
983 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + | ||
984 | sizeof(struct dm_crypt_request) + cc->iv_size); | ||
985 | if (!cc->req_pool) { | ||
986 | ti->error = "Cannot allocate crypt request mempool"; | ||
987 | goto bad_req_pool; | ||
988 | } | ||
989 | cc->req = NULL; | ||
990 | |||
844 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); | 991 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); |
845 | if (!cc->page_pool) { | 992 | if (!cc->page_pool) { |
846 | ti->error = "Cannot allocate page mempool"; | 993 | ti->error = "Cannot allocate page mempool"; |
@@ -853,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
853 | goto bad_bs; | 1000 | goto bad_bs; |
854 | } | 1001 | } |
855 | 1002 | ||
856 | if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { | 1003 | if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { |
857 | ti->error = "Error setting key"; | 1004 | ti->error = "Error setting key"; |
858 | goto bad_device; | 1005 | goto bad_device; |
859 | } | 1006 | } |
@@ -914,12 +1061,14 @@ bad_device: | |||
914 | bad_bs: | 1061 | bad_bs: |
915 | mempool_destroy(cc->page_pool); | 1062 | mempool_destroy(cc->page_pool); |
916 | bad_page_pool: | 1063 | bad_page_pool: |
1064 | mempool_destroy(cc->req_pool); | ||
1065 | bad_req_pool: | ||
917 | mempool_destroy(cc->io_pool); | 1066 | mempool_destroy(cc->io_pool); |
918 | bad_slab_pool: | 1067 | bad_slab_pool: |
919 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 1068 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
920 | cc->iv_gen_ops->dtr(cc); | 1069 | cc->iv_gen_ops->dtr(cc); |
921 | bad_ivmode: | 1070 | bad_ivmode: |
922 | crypto_free_blkcipher(tfm); | 1071 | crypto_free_ablkcipher(tfm); |
923 | bad_cipher: | 1072 | bad_cipher: |
924 | /* Must zero key material before freeing */ | 1073 | /* Must zero key material before freeing */ |
925 | memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); | 1074 | memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); |
@@ -934,14 +1083,18 @@ static void crypt_dtr(struct dm_target *ti) | |||
934 | destroy_workqueue(cc->io_queue); | 1083 | destroy_workqueue(cc->io_queue); |
935 | destroy_workqueue(cc->crypt_queue); | 1084 | destroy_workqueue(cc->crypt_queue); |
936 | 1085 | ||
1086 | if (cc->req) | ||
1087 | mempool_free(cc->req, cc->req_pool); | ||
1088 | |||
937 | bioset_free(cc->bs); | 1089 | bioset_free(cc->bs); |
938 | mempool_destroy(cc->page_pool); | 1090 | mempool_destroy(cc->page_pool); |
1091 | mempool_destroy(cc->req_pool); | ||
939 | mempool_destroy(cc->io_pool); | 1092 | mempool_destroy(cc->io_pool); |
940 | 1093 | ||
941 | kfree(cc->iv_mode); | 1094 | kfree(cc->iv_mode); |
942 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 1095 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
943 | cc->iv_gen_ops->dtr(cc); | 1096 | cc->iv_gen_ops->dtr(cc); |
944 | crypto_free_blkcipher(cc->tfm); | 1097 | crypto_free_ablkcipher(cc->tfm); |
945 | dm_put_device(ti, cc->dev); | 1098 | dm_put_device(ti, cc->dev); |
946 | 1099 | ||
947 | /* Must zero key material before freeing */ | 1100 | /* Must zero key material before freeing */ |
@@ -958,6 +1111,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
958 | io = mempool_alloc(cc->io_pool, GFP_NOIO); | 1111 | io = mempool_alloc(cc->io_pool, GFP_NOIO); |
959 | io->target = ti; | 1112 | io->target = ti; |
960 | io->base_bio = bio; | 1113 | io->base_bio = bio; |
1114 | io->sector = bio->bi_sector - ti->begin; | ||
961 | io->error = 0; | 1115 | io->error = 0; |
962 | atomic_set(&io->pending, 0); | 1116 | atomic_set(&io->pending, 0); |
963 | 1117 | ||
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 8fe81e1807e0..5bbce29f143a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store) | |||
449 | 449 | ||
450 | static int persistent_read_metadata(struct exception_store *store) | 450 | static int persistent_read_metadata(struct exception_store *store) |
451 | { | 451 | { |
452 | int r, new_snapshot; | 452 | int r, uninitialized_var(new_snapshot); |
453 | struct pstore *ps = get_info(store); | 453 | struct pstore *ps = get_info(store); |
454 | 454 | ||
455 | /* | 455 | /* |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 9627fa0f9470..b262c0042de3 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/dm-ioctl.h> | 16 | #include <linux/dm-ioctl.h> |
17 | #include <linux/hdreg.h> | 17 | #include <linux/hdreg.h> |
18 | #include <linux/compat.h> | ||
18 | 19 | ||
19 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
20 | 21 | ||
@@ -702,7 +703,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
702 | int r; | 703 | int r; |
703 | char *new_name = (char *) param + param->data_start; | 704 | char *new_name = (char *) param + param->data_start; |
704 | 705 | ||
705 | if (new_name < (char *) param->data || | 706 | if (new_name < param->data || |
706 | invalid_str(new_name, (void *) param + param_size)) { | 707 | invalid_str(new_name, (void *) param + param_size)) { |
707 | DMWARN("Invalid new logical volume name supplied."); | 708 | DMWARN("Invalid new logical volume name supplied."); |
708 | return -EINVAL; | 709 | return -EINVAL; |
@@ -728,7 +729,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | |||
728 | if (!md) | 729 | if (!md) |
729 | return -ENXIO; | 730 | return -ENXIO; |
730 | 731 | ||
731 | if (geostr < (char *) param->data || | 732 | if (geostr < param->data || |
732 | invalid_str(geostr, (void *) param + param_size)) { | 733 | invalid_str(geostr, (void *) param + param_size)) { |
733 | DMWARN("Invalid geometry supplied."); | 734 | DMWARN("Invalid geometry supplied."); |
734 | goto out; | 735 | goto out; |
@@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | |||
1350 | { | 1351 | { |
1351 | struct dm_ioctl tmp, *dmi; | 1352 | struct dm_ioctl tmp, *dmi; |
1352 | 1353 | ||
1353 | if (copy_from_user(&tmp, user, sizeof(tmp))) | 1354 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) |
1354 | return -EFAULT; | 1355 | return -EFAULT; |
1355 | 1356 | ||
1356 | if (tmp.data_size < sizeof(tmp)) | 1357 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) |
1357 | return -EINVAL; | 1358 | return -EINVAL; |
1358 | 1359 | ||
1359 | dmi = vmalloc(tmp.data_size); | 1360 | dmi = vmalloc(tmp.data_size); |
@@ -1397,13 +1398,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1397 | return 0; | 1398 | return 0; |
1398 | } | 1399 | } |
1399 | 1400 | ||
1400 | static int ctl_ioctl(struct inode *inode, struct file *file, | 1401 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) |
1401 | uint command, ulong u) | ||
1402 | { | 1402 | { |
1403 | int r = 0; | 1403 | int r = 0; |
1404 | unsigned int cmd; | 1404 | unsigned int cmd; |
1405 | struct dm_ioctl *param; | 1405 | struct dm_ioctl *uninitialized_var(param); |
1406 | struct dm_ioctl __user *user = (struct dm_ioctl __user *) u; | ||
1407 | ioctl_fn fn = NULL; | 1406 | ioctl_fn fn = NULL; |
1408 | size_t param_size; | 1407 | size_t param_size; |
1409 | 1408 | ||
@@ -1471,8 +1470,23 @@ static int ctl_ioctl(struct inode *inode, struct file *file, | |||
1471 | return r; | 1470 | return r; |
1472 | } | 1471 | } |
1473 | 1472 | ||
1473 | static long dm_ctl_ioctl(struct file *file, uint command, ulong u) | ||
1474 | { | ||
1475 | return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u); | ||
1476 | } | ||
1477 | |||
1478 | #ifdef CONFIG_COMPAT | ||
1479 | static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) | ||
1480 | { | ||
1481 | return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u)); | ||
1482 | } | ||
1483 | #else | ||
1484 | #define dm_compat_ctl_ioctl NULL | ||
1485 | #endif | ||
1486 | |||
1474 | static const struct file_operations _ctl_fops = { | 1487 | static const struct file_operations _ctl_fops = { |
1475 | .ioctl = ctl_ioctl, | 1488 | .unlocked_ioctl = dm_ctl_ioctl, |
1489 | .compat_ioctl = dm_compat_ctl_ioctl, | ||
1476 | .owner = THIS_MODULE, | 1490 | .owner = THIS_MODULE, |
1477 | }; | 1491 | }; |
1478 | 1492 | ||
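
Editorial note: the copy_params() change above copies only the fixed header of struct dm_ioctl from userspace and rejects any request whose data_size is smaller than that header before allocating the full buffer. Below is a rough userspace model of that check; the struct layout and helper names are stand-ins for illustration, not the real <linux/dm-ioctl.h> definitions.

```c
/*
 * Sketch: validate the fixed part of a variable-sized parameter block
 * before trusting data_size and allocating the full buffer (vmalloc()
 * in the kernel, malloc() here).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct fake_dm_ioctl {
	uint32_t version[3];
	uint32_t data_size;	/* total size: struct + payload */
	uint32_t data_start;
	char data[7];		/* variable-length tail */
};

static int copy_params(const void *user, size_t user_len,
		       struct fake_dm_ioctl **out)
{
	const size_t fixed = sizeof(struct fake_dm_ioctl) -
			     sizeof(((struct fake_dm_ioctl *)0)->data);
	struct fake_dm_ioctl tmp, *dmi;

	if (user_len < fixed)
		return -1;		/* models a failed copy_from_user() */
	memcpy(&tmp, user, fixed);	/* copy only the fixed header */

	if (tmp.data_size < fixed)
		return -1;		/* header claims too little space */

	dmi = malloc(tmp.data_size);
	if (!dmi)
		return -1;
	memcpy(dmi, user, user_len < tmp.data_size ? user_len : tmp.data_size);
	*out = dmi;
	return 0;
}

int main(void)
{
	struct fake_dm_ioctl req = { .data_size = sizeof(req) };
	struct fake_dm_ioctl *parsed;

	if (!copy_params(&req, sizeof(req), &parsed)) {
		printf("accepted request, data_size=%u\n", parsed->data_size);
		free(parsed);
	}
	return 0;
}
```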
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 072ee4353eab..2a74b2142f50 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type) | |||
41 | return 0; | 41 | return 0; |
42 | } | 42 | } |
43 | 43 | ||
44 | static struct dirty_log_type *get_type(const char *type_name) | 44 | static struct dirty_log_type *_get_type(const char *type_name) |
45 | { | 45 | { |
46 | struct dirty_log_type *type; | 46 | struct dirty_log_type *type; |
47 | 47 | ||
@@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name) | |||
61 | return NULL; | 61 | return NULL; |
62 | } | 62 | } |
63 | 63 | ||
64 | /* | ||
65 | * get_type | ||
66 | * @type_name | ||
67 | * | ||
68 | * Attempt to retrieve the dirty_log_type by name. If not already | ||
69 | * available, attempt to load the appropriate module. | ||
70 | * | ||
71 | * Log modules are named "dm-log-" followed by the 'type_name'. | ||
72 | * Modules may contain multiple types. | ||
73 | * This function will first try the module "dm-log-<type_name>", | ||
74 | * then truncate 'type_name' on the last '-' and try again. | ||
75 | * | ||
76 | * For example, if type_name was "clustered-disk", it would search | ||
77 | * 'dm-log-clustered-disk' then 'dm-log-clustered'. | ||
78 | * | ||
79 | * Returns: dirty_log_type* on success, NULL on failure | ||
80 | */ | ||
81 | static struct dirty_log_type *get_type(const char *type_name) | ||
82 | { | ||
83 | char *p, *type_name_dup; | ||
84 | struct dirty_log_type *type; | ||
85 | |||
86 | type = _get_type(type_name); | ||
87 | if (type) | ||
88 | return type; | ||
89 | |||
90 | type_name_dup = kstrdup(type_name, GFP_KERNEL); | ||
91 | if (!type_name_dup) { | ||
92 | DMWARN("No memory left to attempt log module load for \"%s\"", | ||
93 | type_name); | ||
94 | return NULL; | ||
95 | } | ||
96 | |||
97 | while (request_module("dm-log-%s", type_name_dup) || | ||
98 | !(type = _get_type(type_name))) { | ||
99 | p = strrchr(type_name_dup, '-'); | ||
100 | if (!p) | ||
101 | break; | ||
102 | p[0] = '\0'; | ||
103 | } | ||
104 | |||
105 | if (!type) | ||
106 | DMWARN("Module for logging type \"%s\" not found.", type_name); | ||
107 | |||
108 | kfree(type_name_dup); | ||
109 | |||
110 | return type; | ||
111 | } | ||
112 | |||
64 | static void put_type(struct dirty_log_type *type) | 113 | static void put_type(struct dirty_log_type *type) |
65 | { | 114 | { |
66 | spin_lock(&_lock); | 115 | spin_lock(&_lock); |
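
Editorial note: the new get_type() above auto-loads log modules by first trying "dm-log-<type>" and then repeatedly truncating the type name at its last '-'. The small userspace sketch below reproduces that fallback loop; try_load() is a stub that only prints the candidate names instead of calling request_module().

```c
/*
 * Sketch of the module-name fallback: "clustered-disk" first tries
 * dm-log-clustered-disk, then dm-log-clustered, then gives up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int try_load(const char *type)
{
	printf("request_module(\"dm-log-%s\")\n", type);
	return -1;	/* pretend the module was not found */
}

int main(void)
{
	const char *type_name = "clustered-disk";
	char *dup = strdup(type_name);
	char *p;

	if (!dup)
		return 1;

	while (try_load(dup)) {
		p = strrchr(dup, '-');
		if (!p)
			break;		/* nothing left to truncate */
		*p = '\0';		/* "clustered-disk" -> "clustered" */
	}

	free(dup);
	return 0;
}
```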
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 24b2b1e32fae..e7ee59e655d5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath); | |||
106 | 106 | ||
107 | static struct kmem_cache *_mpio_cache; | 107 | static struct kmem_cache *_mpio_cache; |
108 | 108 | ||
109 | struct workqueue_struct *kmultipathd; | 109 | static struct workqueue_struct *kmultipathd; |
110 | static void process_queued_ios(struct work_struct *work); | 110 | static void process_queued_ios(struct work_struct *work); |
111 | static void trigger_event(struct work_struct *work); | 111 | static void trigger_event(struct work_struct *work); |
112 | 112 | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 31123d4a6b9c..edc057f5cdcc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include "dm.h" | 7 | #include "dm.h" |
8 | #include "dm-bio-list.h" | 8 | #include "dm-bio-list.h" |
9 | #include "dm-bio-record.h" | ||
9 | #include "dm-io.h" | 10 | #include "dm-io.h" |
10 | #include "dm-log.h" | 11 | #include "dm-log.h" |
11 | #include "kcopyd.h" | 12 | #include "kcopyd.h" |
@@ -20,6 +21,7 @@ | |||
20 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
21 | #include <linux/workqueue.h> | 22 | #include <linux/workqueue.h> |
22 | #include <linux/log2.h> | 23 | #include <linux/log2.h> |
24 | #include <linux/hardirq.h> | ||
23 | 25 | ||
24 | #define DM_MSG_PREFIX "raid1" | 26 | #define DM_MSG_PREFIX "raid1" |
25 | #define DM_IO_PAGES 64 | 27 | #define DM_IO_PAGES 64 |
@@ -113,9 +115,16 @@ struct region { | |||
113 | /*----------------------------------------------------------------- | 115 | /*----------------------------------------------------------------- |
114 | * Mirror set structures. | 116 | * Mirror set structures. |
115 | *---------------------------------------------------------------*/ | 117 | *---------------------------------------------------------------*/ |
118 | enum dm_raid1_error { | ||
119 | DM_RAID1_WRITE_ERROR, | ||
120 | DM_RAID1_SYNC_ERROR, | ||
121 | DM_RAID1_READ_ERROR | ||
122 | }; | ||
123 | |||
116 | struct mirror { | 124 | struct mirror { |
117 | struct mirror_set *ms; | 125 | struct mirror_set *ms; |
118 | atomic_t error_count; | 126 | atomic_t error_count; |
127 | uint32_t error_type; | ||
119 | struct dm_dev *dev; | 128 | struct dm_dev *dev; |
120 | sector_t offset; | 129 | sector_t offset; |
121 | }; | 130 | }; |
@@ -127,21 +136,25 @@ struct mirror_set { | |||
127 | struct kcopyd_client *kcopyd_client; | 136 | struct kcopyd_client *kcopyd_client; |
128 | uint64_t features; | 137 | uint64_t features; |
129 | 138 | ||
130 | spinlock_t lock; /* protects the next two lists */ | 139 | spinlock_t lock; /* protects the lists */ |
131 | struct bio_list reads; | 140 | struct bio_list reads; |
132 | struct bio_list writes; | 141 | struct bio_list writes; |
142 | struct bio_list failures; | ||
133 | 143 | ||
134 | struct dm_io_client *io_client; | 144 | struct dm_io_client *io_client; |
145 | mempool_t *read_record_pool; | ||
135 | 146 | ||
136 | /* recovery */ | 147 | /* recovery */ |
137 | region_t nr_regions; | 148 | region_t nr_regions; |
138 | int in_sync; | 149 | int in_sync; |
139 | int log_failure; | 150 | int log_failure; |
151 | atomic_t suspend; | ||
140 | 152 | ||
141 | struct mirror *default_mirror; /* Default mirror */ | 153 | atomic_t default_mirror; /* Default mirror */ |
142 | 154 | ||
143 | struct workqueue_struct *kmirrord_wq; | 155 | struct workqueue_struct *kmirrord_wq; |
144 | struct work_struct kmirrord_work; | 156 | struct work_struct kmirrord_work; |
157 | struct work_struct trigger_event; | ||
145 | 158 | ||
146 | unsigned int nr_mirrors; | 159 | unsigned int nr_mirrors; |
147 | struct mirror mirror[0]; | 160 | struct mirror mirror[0]; |
@@ -362,6 +375,16 @@ static void complete_resync_work(struct region *reg, int success) | |||
362 | struct region_hash *rh = reg->rh; | 375 | struct region_hash *rh = reg->rh; |
363 | 376 | ||
364 | rh->log->type->set_region_sync(rh->log, reg->key, success); | 377 | rh->log->type->set_region_sync(rh->log, reg->key, success); |
378 | |||
379 | /* | ||
380 | * Dispatch the bios before we call 'wake_up_all'. | ||
381 | * This is important because if we are suspending, | ||
382 | * we want to know that recovery is complete and | ||
383 | * the work queue is flushed. If we wake_up_all | ||
384 | * before we dispatch_bios (queue bios and call wake()), | ||
385 | * then we risk suspending before the work queue | ||
386 | * has been properly flushed. | ||
387 | */ | ||
365 | dispatch_bios(rh->ms, ®->delayed_bios); | 388 | dispatch_bios(rh->ms, ®->delayed_bios); |
366 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | 389 | if (atomic_dec_and_test(&rh->recovery_in_flight)) |
367 | wake_up_all(&_kmirrord_recovery_stopped); | 390 | wake_up_all(&_kmirrord_recovery_stopped); |
@@ -626,24 +649,101 @@ static void rh_start_recovery(struct region_hash *rh) | |||
626 | wake(rh->ms); | 649 | wake(rh->ms); |
627 | } | 650 | } |
628 | 651 | ||
652 | #define MIN_READ_RECORDS 20 | ||
653 | struct dm_raid1_read_record { | ||
654 | struct mirror *m; | ||
655 | struct dm_bio_details details; | ||
656 | }; | ||
657 | |||
629 | /* | 658 | /* |
630 | * Every mirror should look like this one. | 659 | * Every mirror should look like this one. |
631 | */ | 660 | */ |
632 | #define DEFAULT_MIRROR 0 | 661 | #define DEFAULT_MIRROR 0 |
633 | 662 | ||
634 | /* | 663 | /* |
635 | * This is yucky. We squirrel the mirror_set struct away inside | 664 | * This is yucky. We squirrel the mirror struct away inside |
636 | * bi_next for write buffers. This is safe since the bh | 665 | * bi_next for read/write buffers. This is safe since the bh |
637 | * doesn't get submitted to the lower levels of block layer. | 666 | * doesn't get submitted to the lower levels of block layer. |
638 | */ | 667 | */ |
639 | static struct mirror_set *bio_get_ms(struct bio *bio) | 668 | static struct mirror *bio_get_m(struct bio *bio) |
669 | { | ||
670 | return (struct mirror *) bio->bi_next; | ||
671 | } | ||
672 | |||
673 | static void bio_set_m(struct bio *bio, struct mirror *m) | ||
674 | { | ||
675 | bio->bi_next = (struct bio *) m; | ||
676 | } | ||
677 | |||
678 | static struct mirror *get_default_mirror(struct mirror_set *ms) | ||
640 | { | 679 | { |
641 | return (struct mirror_set *) bio->bi_next; | 680 | return &ms->mirror[atomic_read(&ms->default_mirror)]; |
642 | } | 681 | } |
643 | 682 | ||
644 | static void bio_set_ms(struct bio *bio, struct mirror_set *ms) | 683 | static void set_default_mirror(struct mirror *m) |
645 | { | 684 | { |
646 | bio->bi_next = (struct bio *) ms; | 685 | struct mirror_set *ms = m->ms; |
686 | struct mirror *m0 = &(ms->mirror[0]); | ||
687 | |||
688 | atomic_set(&ms->default_mirror, m - m0); | ||
689 | } | ||
690 | |||
691 | /* fail_mirror | ||
692 | * @m: mirror device to fail | ||
693 | * @error_type: one of the enum's, DM_RAID1_*_ERROR | ||
694 | * | ||
695 | * If errors are being handled, record the type of | ||
696 | * error encountered for this device. If this type | ||
697 | * of error has already been recorded, we can return; | ||
698 | * otherwise, we must signal userspace by triggering | ||
699 | * an event. Additionally, if the device is the | ||
700 | * primary device, we must choose a new primary, but | ||
701 | * only if the mirror is in-sync. | ||
702 | * | ||
703 | * This function must not block. | ||
704 | */ | ||
705 | static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | ||
706 | { | ||
707 | struct mirror_set *ms = m->ms; | ||
708 | struct mirror *new; | ||
709 | |||
710 | if (!errors_handled(ms)) | ||
711 | return; | ||
712 | |||
713 | /* | ||
714 | * error_count is used for nothing more than a | ||
715 | * simple way to tell if a device has encountered | ||
716 | * errors. | ||
717 | */ | ||
718 | atomic_inc(&m->error_count); | ||
719 | |||
720 | if (test_and_set_bit(error_type, &m->error_type)) | ||
721 | return; | ||
722 | |||
723 | if (m != get_default_mirror(ms)) | ||
724 | goto out; | ||
725 | |||
726 | if (!ms->in_sync) { | ||
727 | /* | ||
728 | * Better to issue requests to same failing device | ||
729 | * than to risk returning corrupt data. | ||
730 | */ | ||
731 | DMERR("Primary mirror (%s) failed while out-of-sync: " | ||
732 | "Reads may fail.", m->dev->name); | ||
733 | goto out; | ||
734 | } | ||
735 | |||
736 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) | ||
737 | if (!atomic_read(&new->error_count)) { | ||
738 | set_default_mirror(new); | ||
739 | break; | ||
740 | } | ||
741 | |||
742 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) | ||
743 | DMWARN("All sides of mirror have failed."); | ||
744 | |||
745 | out: | ||
746 | schedule_work(&ms->trigger_event); | ||
647 | } | 747 | } |
648 | 748 | ||
649 | /*----------------------------------------------------------------- | 749 | /*----------------------------------------------------------------- |
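
Editorial note: fail_mirror() above records the error type per device and, if the failing device was the default mirror of an in-sync set, promotes the first error-free mirror to be the new default. The toy model below reproduces that selection with plain ints and a plain index in place of the driver's atomic counters and mirror_set structures; names like toy_mirror are invented for the sketch.

```c
/*
 * Toy model of the default-mirror switch: bump the failing device's
 * error count, and if it was the default, pick the first device that
 * has seen no errors as the new default.
 */
#include <stdio.h>

#define NR_MIRRORS 3

struct toy_mirror {
	const char *name;
	int error_count;
};

static struct toy_mirror mirrors[NR_MIRRORS] = {
	{ "sda", 0 }, { "sdb", 0 }, { "sdc", 0 },
};
static int default_mirror;	/* index, models ms->default_mirror */

static void fail_mirror(int idx)
{
	mirrors[idx].error_count++;

	if (idx != default_mirror)
		return;		/* only a failing default forces a switch */

	for (int i = 0; i < NR_MIRRORS; i++) {
		if (!mirrors[i].error_count) {
			default_mirror = i;
			return;
		}
	}
	fprintf(stderr, "All sides of mirror have failed.\n");
}

int main(void)
{
	fail_mirror(0);		/* primary fails: default moves to sdb */
	printf("default is now %s\n", mirrors[default_mirror].name);
	return 0;
}
```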
@@ -656,15 +756,32 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms) | |||
656 | static void recovery_complete(int read_err, unsigned int write_err, | 756 | static void recovery_complete(int read_err, unsigned int write_err, |
657 | void *context) | 757 | void *context) |
658 | { | 758 | { |
659 | struct region *reg = (struct region *) context; | 759 | struct region *reg = (struct region *)context; |
760 | struct mirror_set *ms = reg->rh->ms; | ||
761 | int m, bit = 0; | ||
660 | 762 | ||
661 | if (read_err) | 763 | if (read_err) { |
662 | /* Read error means the failure of default mirror. */ | 764 | /* Read error means the failure of default mirror. */ |
663 | DMERR_LIMIT("Unable to read primary mirror during recovery"); | 765 | DMERR_LIMIT("Unable to read primary mirror during recovery"); |
766 | fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); | ||
767 | } | ||
664 | 768 | ||
665 | if (write_err) | 769 | if (write_err) { |
666 | DMERR_LIMIT("Write error during recovery (error = 0x%x)", | 770 | DMERR_LIMIT("Write error during recovery (error = 0x%x)", |
667 | write_err); | 771 | write_err); |
772 | /* | ||
773 | * Bits correspond to devices (excluding default mirror). | ||
774 | * The default mirror cannot change during recovery. | ||
775 | */ | ||
776 | for (m = 0; m < ms->nr_mirrors; m++) { | ||
777 | if (&ms->mirror[m] == get_default_mirror(ms)) | ||
778 | continue; | ||
779 | if (test_bit(bit, &write_err)) | ||
780 | fail_mirror(ms->mirror + m, | ||
781 | DM_RAID1_SYNC_ERROR); | ||
782 | bit++; | ||
783 | } | ||
784 | } | ||
668 | 785 | ||
669 | rh_recovery_end(reg, !(read_err || write_err)); | 786 | rh_recovery_end(reg, !(read_err || write_err)); |
670 | } | 787 | } |
@@ -678,7 +795,7 @@ static int recover(struct mirror_set *ms, struct region *reg) | |||
678 | unsigned long flags = 0; | 795 | unsigned long flags = 0; |
679 | 796 | ||
680 | /* fill in the source */ | 797 | /* fill in the source */ |
681 | m = ms->default_mirror; | 798 | m = get_default_mirror(ms); |
682 | from.bdev = m->dev->bdev; | 799 | from.bdev = m->dev->bdev; |
683 | from.sector = m->offset + region_to_sector(reg->rh, reg->key); | 800 | from.sector = m->offset + region_to_sector(reg->rh, reg->key); |
684 | if (reg->key == (ms->nr_regions - 1)) { | 801 | if (reg->key == (ms->nr_regions - 1)) { |
@@ -694,7 +811,7 @@ static int recover(struct mirror_set *ms, struct region *reg) | |||
694 | 811 | ||
695 | /* fill in the destinations */ | 812 | /* fill in the destinations */ |
696 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { | 813 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { |
697 | if (&ms->mirror[i] == ms->default_mirror) | 814 | if (&ms->mirror[i] == get_default_mirror(ms)) |
698 | continue; | 815 | continue; |
699 | 816 | ||
700 | m = ms->mirror + i; | 817 | m = ms->mirror + i; |
@@ -748,17 +865,105 @@ static void do_recovery(struct mirror_set *ms) | |||
748 | *---------------------------------------------------------------*/ | 865 | *---------------------------------------------------------------*/ |
749 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) | 866 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) |
750 | { | 867 | { |
751 | /* FIXME: add read balancing */ | 868 | struct mirror *m = get_default_mirror(ms); |
752 | return ms->default_mirror; | 869 | |
870 | do { | ||
871 | if (likely(!atomic_read(&m->error_count))) | ||
872 | return m; | ||
873 | |||
874 | if (m-- == ms->mirror) | ||
875 | m += ms->nr_mirrors; | ||
876 | } while (m != get_default_mirror(ms)); | ||
877 | |||
878 | return NULL; | ||
879 | } | ||
880 | |||
881 | static int default_ok(struct mirror *m) | ||
882 | { | ||
883 | struct mirror *default_mirror = get_default_mirror(m->ms); | ||
884 | |||
885 | return !atomic_read(&default_mirror->error_count); | ||
886 | } | ||
887 | |||
888 | static int mirror_available(struct mirror_set *ms, struct bio *bio) | ||
889 | { | ||
890 | region_t region = bio_to_region(&ms->rh, bio); | ||
891 | |||
892 | if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) | ||
893 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; | ||
894 | |||
895 | return 0; | ||
753 | } | 896 | } |
754 | 897 | ||
755 | /* | 898 | /* |
756 | * remap a buffer to a particular mirror. | 899 | * remap a buffer to a particular mirror. |
757 | */ | 900 | */ |
758 | static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) | 901 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
902 | { | ||
903 | return m->offset + (bio->bi_sector - m->ms->ti->begin); | ||
904 | } | ||
905 | |||
906 | static void map_bio(struct mirror *m, struct bio *bio) | ||
759 | { | 907 | { |
760 | bio->bi_bdev = m->dev->bdev; | 908 | bio->bi_bdev = m->dev->bdev; |
761 | bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); | 909 | bio->bi_sector = map_sector(m, bio); |
910 | } | ||
911 | |||
912 | static void map_region(struct io_region *io, struct mirror *m, | ||
913 | struct bio *bio) | ||
914 | { | ||
915 | io->bdev = m->dev->bdev; | ||
916 | io->sector = map_sector(m, bio); | ||
917 | io->count = bio->bi_size >> 9; | ||
918 | } | ||
919 | |||
920 | /*----------------------------------------------------------------- | ||
921 | * Reads | ||
922 | *---------------------------------------------------------------*/ | ||
923 | static void read_callback(unsigned long error, void *context) | ||
924 | { | ||
925 | struct bio *bio = context; | ||
926 | struct mirror *m; | ||
927 | |||
928 | m = bio_get_m(bio); | ||
929 | bio_set_m(bio, NULL); | ||
930 | |||
931 | if (likely(!error)) { | ||
932 | bio_endio(bio, 0); | ||
933 | return; | ||
934 | } | ||
935 | |||
936 | fail_mirror(m, DM_RAID1_READ_ERROR); | ||
937 | |||
938 | if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { | ||
939 | DMWARN_LIMIT("Read failure on mirror device %s. " | ||
940 | "Trying alternative device.", | ||
941 | m->dev->name); | ||
942 | queue_bio(m->ms, bio, bio_rw(bio)); | ||
943 | return; | ||
944 | } | ||
945 | |||
946 | DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", | ||
947 | m->dev->name); | ||
948 | bio_endio(bio, -EIO); | ||
949 | } | ||
950 | |||
951 | /* Asynchronous read. */ | ||
952 | static void read_async_bio(struct mirror *m, struct bio *bio) | ||
953 | { | ||
954 | struct io_region io; | ||
955 | struct dm_io_request io_req = { | ||
956 | .bi_rw = READ, | ||
957 | .mem.type = DM_IO_BVEC, | ||
958 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | ||
959 | .notify.fn = read_callback, | ||
960 | .notify.context = bio, | ||
961 | .client = m->ms->io_client, | ||
962 | }; | ||
963 | |||
964 | map_region(&io, m, bio); | ||
965 | bio_set_m(bio, m); | ||
966 | (void) dm_io(&io_req, 1, &io, NULL); | ||
762 | } | 967 | } |
763 | 968 | ||
764 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) | 969 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) |
@@ -769,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) | |||
769 | 974 | ||
770 | while ((bio = bio_list_pop(reads))) { | 975 | while ((bio = bio_list_pop(reads))) { |
771 | region = bio_to_region(&ms->rh, bio); | 976 | region = bio_to_region(&ms->rh, bio); |
977 | m = get_default_mirror(ms); | ||
772 | 978 | ||
773 | /* | 979 | /* |
774 | * We can only read balance if the region is in sync. | 980 | * We can only read balance if the region is in sync. |
775 | */ | 981 | */ |
776 | if (rh_in_sync(&ms->rh, region, 1)) | 982 | if (likely(rh_in_sync(&ms->rh, region, 1))) |
777 | m = choose_mirror(ms, bio->bi_sector); | 983 | m = choose_mirror(ms, bio->bi_sector); |
778 | else | 984 | else if (m && atomic_read(&m->error_count)) |
779 | m = ms->default_mirror; | 985 | m = NULL; |
780 | 986 | ||
781 | map_bio(ms, m, bio); | 987 | if (likely(m)) |
782 | generic_make_request(bio); | 988 | read_async_bio(m, bio); |
989 | else | ||
990 | bio_endio(bio, -EIO); | ||
783 | } | 991 | } |
784 | } | 992 | } |
785 | 993 | ||
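
To illustrate the read-balancing walk introduced in choose_mirror() above, here is a minimal standalone C sketch, not part of the patch: it models only the selection logic (start at the default leg, step backwards with wrap-around, skip legs with recorded errors). The leg names and error counts are invented test data.

    #include <stdio.h>

    struct leg { const char *name; int error_count; };

    /* Walk backwards from the default leg, wrapping, until a healthy leg is found. */
    static struct leg *choose_leg(struct leg *legs, unsigned nr, unsigned def)
    {
        unsigned i = def;

        do {
            if (!legs[i].error_count)
                return &legs[i];
            i = (i == 0) ? nr - 1 : i - 1;   /* step backwards, wrapping */
        } while (i != def);

        return NULL;                         /* every leg has failed */
    }

    int main(void)
    {
        struct leg legs[3] = { { "sda", 1 }, { "sdb", 0 }, { "sdc", 1 } };
        struct leg *m = choose_leg(legs, 3, 0);

        printf("chosen leg: %s\n", m ? m->name : "none");
        return 0;
    }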
@@ -793,15 +1001,70 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) | |||
793 | * RECOVERING: delay the io until recovery completes | 1001 | * RECOVERING: delay the io until recovery completes |
794 | * NOSYNC: increment pending, just write to the default mirror | 1002 | * NOSYNC: increment pending, just write to the default mirror |
795 | *---------------------------------------------------------------*/ | 1003 | *---------------------------------------------------------------*/ |
1004 | |||
1005 | /* __bio_mark_nosync | ||
1006 | * @ms | ||
1007 | * @bio | ||
1008 | * @done | ||
1009 | * @error | ||
1010 | * | ||
1011 | * The bio was written on some mirror(s) but failed on other mirror(s). | ||
1012 | * We can successfully endio the bio but should avoid the region being | ||
1013 | * marked clean by setting the state RH_NOSYNC. | ||
1014 | * | ||
1015 | * This function is _not_ safe in interrupt context! | ||
1016 | */ | ||
1017 | static void __bio_mark_nosync(struct mirror_set *ms, | ||
1018 | struct bio *bio, unsigned done, int error) | ||
1019 | { | ||
1020 | unsigned long flags; | ||
1021 | struct region_hash *rh = &ms->rh; | ||
1022 | struct dirty_log *log = ms->rh.log; | ||
1023 | struct region *reg; | ||
1024 | region_t region = bio_to_region(rh, bio); | ||
1025 | int recovering = 0; | ||
1026 | |||
1027 | /* We must inform the log that the sync count has changed. */ | ||
1028 | log->type->set_region_sync(log, region, 0); | ||
1029 | ms->in_sync = 0; | ||
1030 | |||
1031 | read_lock(&rh->hash_lock); | ||
1032 | reg = __rh_find(rh, region); | ||
1033 | read_unlock(&rh->hash_lock); | ||
1034 | |||
1035 | /* region hash entry should exist because write was in-flight */ | ||
1036 | BUG_ON(!reg); | ||
1037 | BUG_ON(!list_empty(®->list)); | ||
1038 | |||
1039 | spin_lock_irqsave(&rh->region_lock, flags); | ||
1040 | /* | ||
1041 | * Possible cases: | ||
1042 | * 1) RH_DIRTY | ||
1043 | * 2) RH_NOSYNC: was dirty, other preceding writes failed | ||
1044 | * 3) RH_RECOVERING: flushing pending writes | ||
1045 | * In either case, the region should not have been connected to the list. | ||
1046 | */ | ||
1047 | recovering = (reg->state == RH_RECOVERING); | ||
1048 | reg->state = RH_NOSYNC; | ||
1049 | BUG_ON(!list_empty(®->list)); | ||
1050 | spin_unlock_irqrestore(&rh->region_lock, flags); | ||
1051 | |||
1052 | bio_endio(bio, error); | ||
1053 | if (recovering) | ||
1054 | complete_resync_work(reg, 0); | ||
1055 | } | ||
1056 | |||
796 | static void write_callback(unsigned long error, void *context) | 1057 | static void write_callback(unsigned long error, void *context) |
797 | { | 1058 | { |
798 | unsigned int i; | 1059 | unsigned i, ret = 0; |
799 | int uptodate = 1; | ||
800 | struct bio *bio = (struct bio *) context; | 1060 | struct bio *bio = (struct bio *) context; |
801 | struct mirror_set *ms; | 1061 | struct mirror_set *ms; |
1062 | int uptodate = 0; | ||
1063 | int should_wake = 0; | ||
1064 | unsigned long flags; | ||
802 | 1065 | ||
803 | ms = bio_get_ms(bio); | 1066 | ms = bio_get_m(bio)->ms; |
804 | bio_set_ms(bio, NULL); | 1067 | bio_set_m(bio, NULL); |
805 | 1068 | ||
806 | /* | 1069 | /* |
807 | * NOTE: We don't decrement the pending count here, | 1070 | * NOTE: We don't decrement the pending count here, |
@@ -809,26 +1072,42 @@ static void write_callback(unsigned long error, void *context) | |||
809 | * This way we handle both writes to SYNC and NOSYNC | 1072 | * This way we handle both writes to SYNC and NOSYNC |
810 | * regions with the same code. | 1073 | * regions with the same code. |
811 | */ | 1074 | */ |
1075 | if (likely(!error)) | ||
1076 | goto out; | ||
1077 | |||
1078 | for (i = 0; i < ms->nr_mirrors; i++) | ||
1079 | if (test_bit(i, &error)) | ||
1080 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); | ||
1081 | else | ||
1082 | uptodate = 1; | ||
812 | 1083 | ||
813 | if (error) { | 1084 | if (unlikely(!uptodate)) { |
1085 | DMERR("All replicated volumes dead, failing I/O"); | ||
1086 | /* None of the writes succeeded, fail the I/O. */ | ||
1087 | ret = -EIO; | ||
1088 | } else if (errors_handled(ms)) { | ||
814 | /* | 1089 | /* |
815 | * only error the io if all mirrors failed. | 1090 | * Need to raise event. Since raising |
816 | * FIXME: bogus | 1091 | * events can block, we need to do it in |
1092 | * the main thread. | ||
817 | */ | 1093 | */ |
818 | uptodate = 0; | 1094 | spin_lock_irqsave(&ms->lock, flags); |
819 | for (i = 0; i < ms->nr_mirrors; i++) | 1095 | if (!ms->failures.head) |
820 | if (!test_bit(i, &error)) { | 1096 | should_wake = 1; |
821 | uptodate = 1; | 1097 | bio_list_add(&ms->failures, bio); |
822 | break; | 1098 | spin_unlock_irqrestore(&ms->lock, flags); |
823 | } | 1099 | if (should_wake) |
1100 | wake(ms); | ||
1101 | return; | ||
824 | } | 1102 | } |
825 | bio_endio(bio, 0); | 1103 | out: |
1104 | bio_endio(bio, ret); | ||
826 | } | 1105 | } |
827 | 1106 | ||
828 | static void do_write(struct mirror_set *ms, struct bio *bio) | 1107 | static void do_write(struct mirror_set *ms, struct bio *bio) |
829 | { | 1108 | { |
830 | unsigned int i; | 1109 | unsigned int i; |
831 | struct io_region io[KCOPYD_MAX_REGIONS+1]; | 1110 | struct io_region io[ms->nr_mirrors], *dest = io; |
832 | struct mirror *m; | 1111 | struct mirror *m; |
833 | struct dm_io_request io_req = { | 1112 | struct dm_io_request io_req = { |
834 | .bi_rw = WRITE, | 1113 | .bi_rw = WRITE, |
@@ -839,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
839 | .client = ms->io_client, | 1118 | .client = ms->io_client, |
840 | }; | 1119 | }; |
841 | 1120 | ||
842 | for (i = 0; i < ms->nr_mirrors; i++) { | 1121 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) |
843 | m = ms->mirror + i; | 1122 | map_region(dest++, m, bio); |
844 | |||
845 | io[i].bdev = m->dev->bdev; | ||
846 | io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
847 | io[i].count = bio->bi_size >> 9; | ||
848 | } | ||
849 | 1123 | ||
850 | bio_set_ms(bio, ms); | 1124 | /* |
1125 | * Use default mirror because we only need it to retrieve the reference | ||
1126 | * to the mirror set in write_callback(). | ||
1127 | */ | ||
1128 | bio_set_m(bio, get_default_mirror(ms)); | ||
851 | 1129 | ||
852 | (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); | 1130 | (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); |
853 | } | 1131 | } |
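
A minimal userspace sketch of how write_callback() above interprets the dm_io error bitmap, assuming (as in the patch) that bit i corresponds to mirror leg i. The leg names and the sample bitmap value are invented.

    #include <stdio.h>

    int main(void)
    {
        const char *leg[3] = { "sda", "sdb", "sdc" };
        unsigned long error = 0x5;   /* bits 0 and 2 set: sda and sdc failed */
        int i, uptodate = 0;

        for (i = 0; i < 3; i++) {
            if (error & (1UL << i))
                printf("leg %s: write failed -> mark mirror faulty\n", leg[i]);
            else
                uptodate = 1;        /* at least one leg holds the data */
        }

        if (!uptodate)
            printf("no leg succeeded: complete the bio with -EIO\n");
        else
            printf("some legs succeeded: hand the bio to the failures list "
                   "so the region can be marked out-of-sync\n");
        return 0;
    }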
@@ -900,43 +1178,125 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
900 | /* | 1178 | /* |
901 | * Dispatch io. | 1179 | * Dispatch io. |
902 | */ | 1180 | */ |
903 | if (unlikely(ms->log_failure)) | 1181 | if (unlikely(ms->log_failure)) { |
1182 | spin_lock_irq(&ms->lock); | ||
1183 | bio_list_merge(&ms->failures, &sync); | ||
1184 | spin_unlock_irq(&ms->lock); | ||
1185 | } else | ||
904 | while ((bio = bio_list_pop(&sync))) | 1186 | while ((bio = bio_list_pop(&sync))) |
905 | bio_endio(bio, -EIO); | 1187 | do_write(ms, bio); |
906 | else while ((bio = bio_list_pop(&sync))) | ||
907 | do_write(ms, bio); | ||
908 | 1188 | ||
909 | while ((bio = bio_list_pop(&recover))) | 1189 | while ((bio = bio_list_pop(&recover))) |
910 | rh_delay(&ms->rh, bio); | 1190 | rh_delay(&ms->rh, bio); |
911 | 1191 | ||
912 | while ((bio = bio_list_pop(&nosync))) { | 1192 | while ((bio = bio_list_pop(&nosync))) { |
913 | map_bio(ms, ms->default_mirror, bio); | 1193 | map_bio(get_default_mirror(ms), bio); |
914 | generic_make_request(bio); | 1194 | generic_make_request(bio); |
915 | } | 1195 | } |
916 | } | 1196 | } |
917 | 1197 | ||
1198 | static void do_failures(struct mirror_set *ms, struct bio_list *failures) | ||
1199 | { | ||
1200 | struct bio *bio; | ||
1201 | |||
1202 | if (!failures->head) | ||
1203 | return; | ||
1204 | |||
1205 | if (!ms->log_failure) { | ||
1206 | while ((bio = bio_list_pop(failures))) | ||
1207 | __bio_mark_nosync(ms, bio, bio->bi_size, 0); | ||
1208 | return; | ||
1209 | } | ||
1210 | |||
1211 | /* | ||
1212 | * If the log has failed, unattempted writes are being | ||
1213 | * put on the failures list. We can't issue those writes | ||
1214 | * until a log has been marked, so we must store them. | ||
1215 | * | ||
1216 | * If a 'noflush' suspend is in progress, we can requeue | ||
1217 | * the I/Os to the core. This gives userspace a chance | ||
1218 | * to reconfigure the mirror, at which point the core | ||
1219 | * will reissue the writes. If the 'noflush' flag is | ||
1220 | * not set, we have no choice but to return errors. | ||
1221 | * | ||
1222 | * Some writes on the failures list may have been | ||
1223 | * submitted before the log failure and represent a | ||
1224 | * failure to write to one of the devices. It is ok | ||
1225 | * for us to treat them the same and requeue them | ||
1226 | * as well. | ||
1227 | */ | ||
1228 | if (dm_noflush_suspending(ms->ti)) { | ||
1229 | while ((bio = bio_list_pop(failures))) | ||
1230 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1231 | return; | ||
1232 | } | ||
1233 | |||
1234 | if (atomic_read(&ms->suspend)) { | ||
1235 | while ((bio = bio_list_pop(failures))) | ||
1236 | bio_endio(bio, -EIO); | ||
1237 | return; | ||
1238 | } | ||
1239 | |||
1240 | spin_lock_irq(&ms->lock); | ||
1241 | bio_list_merge(&ms->failures, failures); | ||
1242 | spin_unlock_irq(&ms->lock); | ||
1243 | |||
1244 | wake(ms); | ||
1245 | } | ||
1246 | |||
1247 | static void trigger_event(struct work_struct *work) | ||
1248 | { | ||
1249 | struct mirror_set *ms = | ||
1250 | container_of(work, struct mirror_set, trigger_event); | ||
1251 | |||
1252 | dm_table_event(ms->ti->table); | ||
1253 | } | ||
1254 | |||
918 | /*----------------------------------------------------------------- | 1255 | /*----------------------------------------------------------------- |
919 | * kmirrord | 1256 | * kmirrord |
920 | *---------------------------------------------------------------*/ | 1257 | *---------------------------------------------------------------*/ |
921 | static void do_mirror(struct work_struct *work) | 1258 | static int _do_mirror(struct work_struct *work) |
922 | { | 1259 | { |
923 | struct mirror_set *ms = container_of(work, struct mirror_set, | 1260 | struct mirror_set *ms = container_of(work, struct mirror_set, |
924 | kmirrord_work); | 1261 | kmirrord_work); |
925 | struct bio_list reads, writes; | 1262 | struct bio_list reads, writes, failures; |
1263 | unsigned long flags; | ||
926 | 1264 | ||
927 | spin_lock(&ms->lock); | 1265 | spin_lock_irqsave(&ms->lock, flags); |
928 | reads = ms->reads; | 1266 | reads = ms->reads; |
929 | writes = ms->writes; | 1267 | writes = ms->writes; |
1268 | failures = ms->failures; | ||
930 | bio_list_init(&ms->reads); | 1269 | bio_list_init(&ms->reads); |
931 | bio_list_init(&ms->writes); | 1270 | bio_list_init(&ms->writes); |
932 | spin_unlock(&ms->lock); | 1271 | bio_list_init(&ms->failures); |
1272 | spin_unlock_irqrestore(&ms->lock, flags); | ||
933 | 1273 | ||
934 | rh_update_states(&ms->rh); | 1274 | rh_update_states(&ms->rh); |
935 | do_recovery(ms); | 1275 | do_recovery(ms); |
936 | do_reads(ms, &reads); | 1276 | do_reads(ms, &reads); |
937 | do_writes(ms, &writes); | 1277 | do_writes(ms, &writes); |
1278 | do_failures(ms, &failures); | ||
1279 | |||
1280 | return (ms->failures.head) ? 1 : 0; | ||
938 | } | 1281 | } |
939 | 1282 | ||
1283 | static void do_mirror(struct work_struct *work) | ||
1284 | { | ||
1285 | /* | ||
1286 | * If _do_mirror returns 1, we give it | ||
1287 | * another shot. This helps for cases like | ||
1288 | * 'suspend' where we call flush_workqueue | ||
1289 | * and expect all work to be finished. If | ||
1290 | * a failure happens during a suspend, we | ||
1291 | * can't issue a 'wake' because it would | ||
1292 | * not be honored. Therefore, we return '1' | ||
1293 | * from _do_mirror, and retry here. | ||
1294 | */ | ||
1295 | while (_do_mirror(work)) | ||
1296 | schedule(); | ||
1297 | } | ||
1298 | |||
1299 | |||
940 | /*----------------------------------------------------------------- | 1300 | /*----------------------------------------------------------------- |
941 | * Target functions | 1301 | * Target functions |
942 | *---------------------------------------------------------------*/ | 1302 | *---------------------------------------------------------------*/ |
@@ -965,11 +1325,23 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
965 | ms->nr_mirrors = nr_mirrors; | 1325 | ms->nr_mirrors = nr_mirrors; |
966 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | 1326 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); |
967 | ms->in_sync = 0; | 1327 | ms->in_sync = 0; |
968 | ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; | 1328 | ms->log_failure = 0; |
1329 | atomic_set(&ms->suspend, 0); | ||
1330 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); | ||
1331 | |||
1332 | len = sizeof(struct dm_raid1_read_record); | ||
1333 | ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, | ||
1334 | len); | ||
1335 | if (!ms->read_record_pool) { | ||
1336 | ti->error = "Error creating mirror read_record_pool"; | ||
1337 | kfree(ms); | ||
1338 | return NULL; | ||
1339 | } | ||
969 | 1340 | ||
970 | ms->io_client = dm_io_client_create(DM_IO_PAGES); | 1341 | ms->io_client = dm_io_client_create(DM_IO_PAGES); |
971 | if (IS_ERR(ms->io_client)) { | 1342 | if (IS_ERR(ms->io_client)) { |
972 | ti->error = "Error creating dm_io client"; | 1343 | ti->error = "Error creating dm_io client"; |
1344 | mempool_destroy(ms->read_record_pool); | ||
973 | kfree(ms); | 1345 | kfree(ms); |
974 | return NULL; | 1346 | return NULL; |
975 | } | 1347 | } |
@@ -977,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
977 | if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { | 1349 | if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { |
978 | ti->error = "Error creating dirty region hash"; | 1350 | ti->error = "Error creating dirty region hash"; |
979 | dm_io_client_destroy(ms->io_client); | 1351 | dm_io_client_destroy(ms->io_client); |
1352 | mempool_destroy(ms->read_record_pool); | ||
980 | kfree(ms); | 1353 | kfree(ms); |
981 | return NULL; | 1354 | return NULL; |
982 | } | 1355 | } |
@@ -992,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, | |||
992 | 1365 | ||
993 | dm_io_client_destroy(ms->io_client); | 1366 | dm_io_client_destroy(ms->io_client); |
994 | rh_exit(&ms->rh); | 1367 | rh_exit(&ms->rh); |
1368 | mempool_destroy(ms->read_record_pool); | ||
995 | kfree(ms); | 1369 | kfree(ms); |
996 | } | 1370 | } |
997 | 1371 | ||
@@ -1019,6 +1393,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
1019 | } | 1393 | } |
1020 | 1394 | ||
1021 | ms->mirror[mirror].ms = ms; | 1395 | ms->mirror[mirror].ms = ms; |
1396 | atomic_set(&(ms->mirror[mirror].error_count), 0); | ||
1397 | ms->mirror[mirror].error_type = 0; | ||
1022 | ms->mirror[mirror].offset = offset; | 1398 | ms->mirror[mirror].offset = offset; |
1023 | 1399 | ||
1024 | return 0; | 1400 | return 0; |
@@ -1171,6 +1547,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1171 | goto err_free_context; | 1547 | goto err_free_context; |
1172 | } | 1548 | } |
1173 | INIT_WORK(&ms->kmirrord_work, do_mirror); | 1549 | INIT_WORK(&ms->kmirrord_work, do_mirror); |
1550 | INIT_WORK(&ms->trigger_event, trigger_event); | ||
1174 | 1551 | ||
1175 | r = parse_features(ms, argc, argv, &args_used); | 1552 | r = parse_features(ms, argc, argv, &args_used); |
1176 | if (r) | 1553 | if (r) |
@@ -1220,14 +1597,15 @@ static void mirror_dtr(struct dm_target *ti) | |||
1220 | 1597 | ||
1221 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) | 1598 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) |
1222 | { | 1599 | { |
1600 | unsigned long flags; | ||
1223 | int should_wake = 0; | 1601 | int should_wake = 0; |
1224 | struct bio_list *bl; | 1602 | struct bio_list *bl; |
1225 | 1603 | ||
1226 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; | 1604 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; |
1227 | spin_lock(&ms->lock); | 1605 | spin_lock_irqsave(&ms->lock, flags); |
1228 | should_wake = !(bl->head); | 1606 | should_wake = !(bl->head); |
1229 | bio_list_add(bl, bio); | 1607 | bio_list_add(bl, bio); |
1230 | spin_unlock(&ms->lock); | 1608 | spin_unlock_irqrestore(&ms->lock, flags); |
1231 | 1609 | ||
1232 | if (should_wake) | 1610 | if (should_wake) |
1233 | wake(ms); | 1611 | wake(ms); |
@@ -1242,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, | |||
1242 | int r, rw = bio_rw(bio); | 1620 | int r, rw = bio_rw(bio); |
1243 | struct mirror *m; | 1621 | struct mirror *m; |
1244 | struct mirror_set *ms = ti->private; | 1622 | struct mirror_set *ms = ti->private; |
1245 | 1623 | struct dm_raid1_read_record *read_record = NULL; | |
1246 | map_context->ll = bio_to_region(&ms->rh, bio); | ||
1247 | 1624 | ||
1248 | if (rw == WRITE) { | 1625 | if (rw == WRITE) { |
1626 | /* Save region for mirror_end_io() handler */ | ||
1627 | map_context->ll = bio_to_region(&ms->rh, bio); | ||
1249 | queue_bio(ms, bio, rw); | 1628 | queue_bio(ms, bio, rw); |
1250 | return DM_MAPIO_SUBMITTED; | 1629 | return DM_MAPIO_SUBMITTED; |
1251 | } | 1630 | } |
@@ -1255,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, | |||
1255 | if (r < 0 && r != -EWOULDBLOCK) | 1634 | if (r < 0 && r != -EWOULDBLOCK) |
1256 | return r; | 1635 | return r; |
1257 | 1636 | ||
1258 | if (r == -EWOULDBLOCK) /* FIXME: ugly */ | ||
1259 | r = DM_MAPIO_SUBMITTED; | ||
1260 | |||
1261 | /* | 1637 | /* |
1262 | * We don't want to fast track a recovery just for a read | 1638 | * If the region is not in-sync, queue the bio. |
1263 | * ahead. So we just let it silently fail. | ||
1264 | * FIXME: get rid of this. | ||
1265 | */ | 1639 | */ |
1266 | if (!r && rw == READA) | 1640 | if (!r || (r == -EWOULDBLOCK)) { |
1267 | return -EIO; | 1641 | if (rw == READA) |
1642 | return -EWOULDBLOCK; | ||
1268 | 1643 | ||
1269 | if (!r) { | ||
1270 | /* Pass this io over to the daemon */ | ||
1271 | queue_bio(ms, bio, rw); | 1644 | queue_bio(ms, bio, rw); |
1272 | return DM_MAPIO_SUBMITTED; | 1645 | return DM_MAPIO_SUBMITTED; |
1273 | } | 1646 | } |
1274 | 1647 | ||
1648 | /* | ||
1649 | * The region is in-sync and we can perform reads directly. | ||
1650 | * Store enough information so we can retry if it fails. | ||
1651 | */ | ||
1275 | m = choose_mirror(ms, bio->bi_sector); | 1652 | m = choose_mirror(ms, bio->bi_sector); |
1276 | if (!m) | 1653 | if (unlikely(!m)) |
1277 | return -EIO; | 1654 | return -EIO; |
1278 | 1655 | ||
1279 | map_bio(ms, m, bio); | 1656 | read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); |
1657 | if (likely(read_record)) { | ||
1658 | dm_bio_record(&read_record->details, bio); | ||
1659 | map_context->ptr = read_record; | ||
1660 | read_record->m = m; | ||
1661 | } | ||
1662 | |||
1663 | map_bio(m, bio); | ||
1664 | |||
1280 | return DM_MAPIO_REMAPPED; | 1665 | return DM_MAPIO_REMAPPED; |
1281 | } | 1666 | } |
1282 | 1667 | ||
@@ -1285,71 +1670,173 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1285 | { | 1670 | { |
1286 | int rw = bio_rw(bio); | 1671 | int rw = bio_rw(bio); |
1287 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1672 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1288 | region_t region = map_context->ll; | 1673 | struct mirror *m = NULL; |
1674 | struct dm_bio_details *bd = NULL; | ||
1675 | struct dm_raid1_read_record *read_record = map_context->ptr; | ||
1289 | 1676 | ||
1290 | /* | 1677 | /* |
1291 | * We need to dec pending if this was a write. | 1678 | * We need to dec pending if this was a write. |
1292 | */ | 1679 | */ |
1293 | if (rw == WRITE) | 1680 | if (rw == WRITE) { |
1294 | rh_dec(&ms->rh, region); | 1681 | rh_dec(&ms->rh, map_context->ll); |
1682 | return error; | ||
1683 | } | ||
1295 | 1684 | ||
1296 | return 0; | 1685 | if (error == -EOPNOTSUPP) |
1686 | goto out; | ||
1687 | |||
1688 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
1689 | goto out; | ||
1690 | |||
1691 | if (unlikely(error)) { | ||
1692 | if (!read_record) { | ||
1693 | /* | ||
1694 | * There wasn't enough memory to record necessary | ||
1695 | * information for a retry or there was no other | ||
1696 | * mirror in-sync. | ||
1697 | */ | ||
1698 | DMERR_LIMIT("Mirror read failed."); | ||
1699 | return -EIO; | ||
1700 | } | ||
1701 | |||
1702 | m = read_record->m; | ||
1703 | DMERR("Mirror read failed from %s. Trying alternative device.", | ||
1704 | m->dev->name); | ||
1705 | |||
1706 | fail_mirror(m, DM_RAID1_READ_ERROR); | ||
1707 | |||
1708 | /* | ||
1709 | * A failed read is requeued for another attempt using an intact | ||
1710 | * mirror. | ||
1711 | */ | ||
1712 | if (default_ok(m) || mirror_available(ms, bio)) { | ||
1713 | bd = &read_record->details; | ||
1714 | |||
1715 | dm_bio_restore(bd, bio); | ||
1716 | mempool_free(read_record, ms->read_record_pool); | ||
1717 | map_context->ptr = NULL; | ||
1718 | queue_bio(ms, bio, rw); | ||
1719 | return 1; | ||
1720 | } | ||
1721 | DMERR("All replicated volumes dead, failing I/O"); | ||
1722 | } | ||
1723 | |||
1724 | out: | ||
1725 | if (read_record) { | ||
1726 | mempool_free(read_record, ms->read_record_pool); | ||
1727 | map_context->ptr = NULL; | ||
1728 | } | ||
1729 | |||
1730 | return error; | ||
1297 | } | 1731 | } |
1298 | 1732 | ||
1299 | static void mirror_postsuspend(struct dm_target *ti) | 1733 | static void mirror_presuspend(struct dm_target *ti) |
1300 | { | 1734 | { |
1301 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1735 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1302 | struct dirty_log *log = ms->rh.log; | 1736 | struct dirty_log *log = ms->rh.log; |
1303 | 1737 | ||
1738 | atomic_set(&ms->suspend, 1); | ||
1739 | |||
1740 | /* | ||
1741 | * We must finish up all the work that we've | ||
1742 | * generated (i.e. recovery work). | ||
1743 | */ | ||
1304 | rh_stop_recovery(&ms->rh); | 1744 | rh_stop_recovery(&ms->rh); |
1305 | 1745 | ||
1306 | /* Wait for all I/O we generated to complete */ | ||
1307 | wait_event(_kmirrord_recovery_stopped, | 1746 | wait_event(_kmirrord_recovery_stopped, |
1308 | !atomic_read(&ms->rh.recovery_in_flight)); | 1747 | !atomic_read(&ms->rh.recovery_in_flight)); |
1309 | 1748 | ||
1749 | if (log->type->presuspend && log->type->presuspend(log)) | ||
1750 | /* FIXME: need better error handling */ | ||
1751 | DMWARN("log presuspend failed"); | ||
1752 | |||
1753 | /* | ||
1754 | * Now that recovery is complete/stopped and the | ||
1755 | * delayed bios are queued, we need to wait for | ||
1756 | * the worker thread to complete. This way, | ||
1757 | * we know that all of our I/O has been pushed. | ||
1758 | */ | ||
1759 | flush_workqueue(ms->kmirrord_wq); | ||
1760 | } | ||
1761 | |||
1762 | static void mirror_postsuspend(struct dm_target *ti) | ||
1763 | { | ||
1764 | struct mirror_set *ms = ti->private; | ||
1765 | struct dirty_log *log = ms->rh.log; | ||
1766 | |||
1310 | if (log->type->postsuspend && log->type->postsuspend(log)) | 1767 | if (log->type->postsuspend && log->type->postsuspend(log)) |
1311 | /* FIXME: need better error handling */ | 1768 | /* FIXME: need better error handling */ |
1312 | DMWARN("log suspend failed"); | 1769 | DMWARN("log postsuspend failed"); |
1313 | } | 1770 | } |
1314 | 1771 | ||
1315 | static void mirror_resume(struct dm_target *ti) | 1772 | static void mirror_resume(struct dm_target *ti) |
1316 | { | 1773 | { |
1317 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1774 | struct mirror_set *ms = ti->private; |
1318 | struct dirty_log *log = ms->rh.log; | 1775 | struct dirty_log *log = ms->rh.log; |
1776 | |||
1777 | atomic_set(&ms->suspend, 0); | ||
1319 | if (log->type->resume && log->type->resume(log)) | 1778 | if (log->type->resume && log->type->resume(log)) |
1320 | /* FIXME: need better error handling */ | 1779 | /* FIXME: need better error handling */ |
1321 | DMWARN("log resume failed"); | 1780 | DMWARN("log resume failed"); |
1322 | rh_start_recovery(&ms->rh); | 1781 | rh_start_recovery(&ms->rh); |
1323 | } | 1782 | } |
1324 | 1783 | ||
1784 | /* | ||
1785 | * device_status_char | ||
1786 | * @m: mirror device/leg we want the status of | ||
1787 | * | ||
1788 | * We return one character representing the most severe error | ||
1789 | * we have encountered. | ||
1790 | * A => Alive - No failures | ||
1791 | * D => Dead - A write failure occurred leaving mirror out-of-sync | ||
1792 | * S => Sync - A synchronization failure occurred, mirror out-of-sync | ||
1793 | * R => Read - A read failure occurred, mirror data unaffected | ||
1794 | * | ||
1795 | * Returns: <char> | ||
1796 | */ | ||
1797 | static char device_status_char(struct mirror *m) | ||
1798 | { | ||
1799 | if (!atomic_read(&(m->error_count))) | ||
1800 | return 'A'; | ||
1801 | |||
1802 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | ||
1803 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : | ||
1804 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; | ||
1805 | } | ||
1806 | |||
1807 | |||
1325 | static int mirror_status(struct dm_target *ti, status_type_t type, | 1808 | static int mirror_status(struct dm_target *ti, status_type_t type, |
1326 | char *result, unsigned int maxlen) | 1809 | char *result, unsigned int maxlen) |
1327 | { | 1810 | { |
1328 | unsigned int m, sz = 0; | 1811 | unsigned int m, sz = 0; |
1329 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1812 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1813 | struct dirty_log *log = ms->rh.log; | ||
1814 | char buffer[ms->nr_mirrors + 1]; | ||
1330 | 1815 | ||
1331 | switch (type) { | 1816 | switch (type) { |
1332 | case STATUSTYPE_INFO: | 1817 | case STATUSTYPE_INFO: |
1333 | DMEMIT("%d ", ms->nr_mirrors); | 1818 | DMEMIT("%d ", ms->nr_mirrors); |
1334 | for (m = 0; m < ms->nr_mirrors; m++) | 1819 | for (m = 0; m < ms->nr_mirrors; m++) { |
1335 | DMEMIT("%s ", ms->mirror[m].dev->name); | 1820 | DMEMIT("%s ", ms->mirror[m].dev->name); |
1821 | buffer[m] = device_status_char(&(ms->mirror[m])); | ||
1822 | } | ||
1823 | buffer[m] = '\0'; | ||
1336 | 1824 | ||
1337 | DMEMIT("%llu/%llu 0 ", | 1825 | DMEMIT("%llu/%llu 1 %s ", |
1338 | (unsigned long long)ms->rh.log->type-> | 1826 | (unsigned long long)log->type->get_sync_count(ms->rh.log), |
1339 | get_sync_count(ms->rh.log), | 1827 | (unsigned long long)ms->nr_regions, buffer); |
1340 | (unsigned long long)ms->nr_regions); | ||
1341 | 1828 | ||
1342 | sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); | 1829 | sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); |
1343 | 1830 | ||
1344 | break; | 1831 | break; |
1345 | 1832 | ||
1346 | case STATUSTYPE_TABLE: | 1833 | case STATUSTYPE_TABLE: |
1347 | sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); | 1834 | sz = log->type->status(ms->rh.log, type, result, maxlen); |
1348 | 1835 | ||
1349 | DMEMIT("%d", ms->nr_mirrors); | 1836 | DMEMIT("%d", ms->nr_mirrors); |
1350 | for (m = 0; m < ms->nr_mirrors; m++) | 1837 | for (m = 0; m < ms->nr_mirrors; m++) |
1351 | DMEMIT(" %s %llu", ms->mirror[m].dev->name, | 1838 | DMEMIT(" %s %llu", ms->mirror[m].dev->name, |
1352 | (unsigned long long)ms->mirror[m].offset); | 1839 | (unsigned long long)ms->mirror[m].offset); |
1353 | 1840 | ||
1354 | if (ms->features & DM_RAID1_HANDLE_ERRORS) | 1841 | if (ms->features & DM_RAID1_HANDLE_ERRORS) |
1355 | DMEMIT(" 1 handle_errors"); | 1842 | DMEMIT(" 1 handle_errors"); |
@@ -1360,12 +1847,13 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
1360 | 1847 | ||
1361 | static struct target_type mirror_target = { | 1848 | static struct target_type mirror_target = { |
1362 | .name = "mirror", | 1849 | .name = "mirror", |
1363 | .version = {1, 0, 3}, | 1850 | .version = {1, 0, 20}, |
1364 | .module = THIS_MODULE, | 1851 | .module = THIS_MODULE, |
1365 | .ctr = mirror_ctr, | 1852 | .ctr = mirror_ctr, |
1366 | .dtr = mirror_dtr, | 1853 | .dtr = mirror_dtr, |
1367 | .map = mirror_map, | 1854 | .map = mirror_map, |
1368 | .end_io = mirror_end_io, | 1855 | .end_io = mirror_end_io, |
1856 | .presuspend = mirror_presuspend, | ||
1369 | .postsuspend = mirror_postsuspend, | 1857 | .postsuspend = mirror_postsuspend, |
1370 | .resume = mirror_resume, | 1858 | .resume = mirror_resume, |
1371 | .status = mirror_status, | 1859 | .status = mirror_status, |
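
As a rough illustration of the new mirror status reporting, the sketch below rebuilds the per-leg status characters and the STATUSTYPE_INFO line format in plain userspace C. The DM_RAID1_*_ERROR bit positions, device names and counters used here are assumptions for illustration, not values taken from the kernel headers.

    #include <stdio.h>

    enum { WRITE_ERROR, SYNC_ERROR, READ_ERROR };   /* assumed bit positions */

    struct leg {
        const char *name;
        unsigned error_count;
        unsigned long error_type;
    };

    /* Most severe condition wins: D (write), S (sync), R (read), A (alive). */
    static char status_char(const struct leg *m)
    {
        if (!m->error_count)
            return 'A';
        return (m->error_type & (1UL << WRITE_ERROR)) ? 'D' :
               (m->error_type & (1UL << SYNC_ERROR))  ? 'S' :
               (m->error_type & (1UL << READ_ERROR))  ? 'R' : 'U';
    }

    int main(void)
    {
        const int nr = 2;
        struct leg legs[2] = {
            { "8:16", 0, 0 },
            { "8:32", 2, 1UL << READ_ERROR },
        };
        char buffer[3];
        int i;

        printf("%d ", nr);
        for (i = 0; i < nr; i++) {
            printf("%s ", legs[i].name);
            buffer[i] = status_char(&legs[i]);
        }
        buffer[i] = '\0';
        /* corresponds to DMEMIT("%llu/%llu 1 %s ", sync_count, nr_regions, buffer) */
        printf("%llu/%llu 1 %s\n", 950ULL, 1000ULL, buffer);
        return 0;
    }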
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index cee16fadd9ee..ae24eab8cd81 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
213 | 213 | ||
214 | /* | 214 | /* |
215 | * Implementation of the exception hash tables. | 215 | * Implementation of the exception hash tables. |
216 | * The lowest hash_shift bits of the chunk number are ignored, allowing | ||
217 | * some consecutive chunks to be grouped together. | ||
216 | */ | 218 | */ |
217 | static int init_exception_table(struct exception_table *et, uint32_t size) | 219 | static int init_exception_table(struct exception_table *et, uint32_t size, |
220 | unsigned hash_shift) | ||
218 | { | 221 | { |
219 | unsigned int i; | 222 | unsigned int i; |
220 | 223 | ||
224 | et->hash_shift = hash_shift; | ||
221 | et->hash_mask = size - 1; | 225 | et->hash_mask = size - 1; |
222 | et->table = dm_vcalloc(size, sizeof(struct list_head)); | 226 | et->table = dm_vcalloc(size, sizeof(struct list_head)); |
223 | if (!et->table) | 227 | if (!et->table) |
@@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * | |||
248 | 252 | ||
249 | static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | 253 | static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) |
250 | { | 254 | { |
251 | return chunk & et->hash_mask; | 255 | return (chunk >> et->hash_shift) & et->hash_mask; |
252 | } | 256 | } |
253 | 257 | ||
254 | static void insert_exception(struct exception_table *eh, | 258 | static void insert_exception(struct exception_table *eh, |
@@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, | |||
275 | 279 | ||
276 | slot = &et->table[exception_hash(et, chunk)]; | 280 | slot = &et->table[exception_hash(et, chunk)]; |
277 | list_for_each_entry (e, slot, hash_list) | 281 | list_for_each_entry (e, slot, hash_list) |
278 | if (e->old_chunk == chunk) | 282 | if (chunk >= e->old_chunk && |
283 | chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) | ||
279 | return e; | 284 | return e; |
280 | 285 | ||
281 | return NULL; | 286 | return NULL; |
@@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) | |||
307 | mempool_free(pe, pending_pool); | 312 | mempool_free(pe, pending_pool); |
308 | } | 313 | } |
309 | 314 | ||
315 | static void insert_completed_exception(struct dm_snapshot *s, | ||
316 | struct dm_snap_exception *new_e) | ||
317 | { | ||
318 | struct exception_table *eh = &s->complete; | ||
319 | struct list_head *l; | ||
320 | struct dm_snap_exception *e = NULL; | ||
321 | |||
322 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; | ||
323 | |||
324 | /* Add immediately if this table doesn't support consecutive chunks */ | ||
325 | if (!eh->hash_shift) | ||
326 | goto out; | ||
327 | |||
328 | /* List is ordered by old_chunk */ | ||
329 | list_for_each_entry_reverse(e, l, hash_list) { | ||
330 | /* Insert after an existing chunk? */ | ||
331 | if (new_e->old_chunk == (e->old_chunk + | ||
332 | dm_consecutive_chunk_count(e) + 1) && | ||
333 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + | ||
334 | dm_consecutive_chunk_count(e) + 1)) { | ||
335 | dm_consecutive_chunk_count_inc(e); | ||
336 | free_exception(new_e); | ||
337 | return; | ||
338 | } | ||
339 | |||
340 | /* Insert before an existing chunk? */ | ||
341 | if (new_e->old_chunk == (e->old_chunk - 1) && | ||
342 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { | ||
343 | dm_consecutive_chunk_count_inc(e); | ||
344 | e->old_chunk--; | ||
345 | e->new_chunk--; | ||
346 | free_exception(new_e); | ||
347 | return; | ||
348 | } | ||
349 | |||
350 | if (new_e->old_chunk > e->old_chunk) | ||
351 | break; | ||
352 | } | ||
353 | |||
354 | out: | ||
355 | list_add(&new_e->hash_list, e ? &e->hash_list : l); | ||
356 | } | ||
357 | |||
310 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) | 358 | int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) |
311 | { | 359 | { |
312 | struct dm_snap_exception *e; | 360 | struct dm_snap_exception *e; |
@@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) | |||
316 | return -ENOMEM; | 364 | return -ENOMEM; |
317 | 365 | ||
318 | e->old_chunk = old; | 366 | e->old_chunk = old; |
367 | |||
368 | /* Consecutive_count is implicitly initialised to zero */ | ||
319 | e->new_chunk = new; | 369 | e->new_chunk = new; |
320 | insert_exception(&s->complete, e); | 370 | |
371 | insert_completed_exception(s, e); | ||
372 | |||
321 | return 0; | 373 | return 0; |
322 | } | 374 | } |
323 | 375 | ||
@@ -334,16 +386,6 @@ static int calc_max_buckets(void) | |||
334 | } | 386 | } |
335 | 387 | ||
336 | /* | 388 | /* |
337 | * Rounds a number down to a power of 2. | ||
338 | */ | ||
339 | static uint32_t round_down(uint32_t n) | ||
340 | { | ||
341 | while (n & (n - 1)) | ||
342 | n &= (n - 1); | ||
343 | return n; | ||
344 | } | ||
345 | |||
346 | /* | ||
347 | * Allocate room for a suitable hash table. | 389 | * Allocate room for a suitable hash table. |
348 | */ | 390 | */ |
349 | static int init_hash_tables(struct dm_snapshot *s) | 391 | static int init_hash_tables(struct dm_snapshot *s) |
@@ -361,9 +403,9 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
361 | hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; | 403 | hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; |
362 | hash_size = min(hash_size, max_buckets); | 404 | hash_size = min(hash_size, max_buckets); |
363 | 405 | ||
364 | /* Round it down to a power of 2 */ | 406 | hash_size = rounddown_pow_of_two(hash_size); |
365 | hash_size = round_down(hash_size); | 407 | if (init_exception_table(&s->complete, hash_size, |
366 | if (init_exception_table(&s->complete, hash_size)) | 408 | DM_CHUNK_CONSECUTIVE_BITS)) |
367 | return -ENOMEM; | 409 | return -ENOMEM; |
368 | 410 | ||
369 | /* | 411 | /* |
@@ -374,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
374 | if (hash_size < 64) | 416 | if (hash_size < 64) |
375 | hash_size = 64; | 417 | hash_size = 64; |
376 | 418 | ||
377 | if (init_exception_table(&s->pending, hash_size)) { | 419 | if (init_exception_table(&s->pending, hash_size, 0)) { |
378 | exit_exception_table(&s->complete, exception_cache); | 420 | exit_exception_table(&s->complete, exception_cache); |
379 | return -ENOMEM; | 421 | return -ENOMEM; |
380 | } | 422 | } |
@@ -733,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
733 | * Add a proper exception, and remove the | 775 | * Add a proper exception, and remove the |
734 | * in-flight exception from the list. | 776 | * in-flight exception from the list. |
735 | */ | 777 | */ |
736 | insert_exception(&s->complete, e); | 778 | insert_completed_exception(s, e); |
737 | 779 | ||
738 | out: | 780 | out: |
739 | remove_exception(&pe->e); | 781 | remove_exception(&pe->e); |
@@ -867,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio) | |||
867 | } | 909 | } |
868 | 910 | ||
869 | static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | 911 | static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, |
870 | struct bio *bio) | 912 | struct bio *bio, chunk_t chunk) |
871 | { | 913 | { |
872 | bio->bi_bdev = s->cow->bdev; | 914 | bio->bi_bdev = s->cow->bdev; |
873 | bio->bi_sector = chunk_to_sector(s, e->new_chunk) + | 915 | bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + |
874 | (bio->bi_sector & s->chunk_mask); | 916 | (chunk - e->old_chunk)) + |
917 | (bio->bi_sector & s->chunk_mask); | ||
875 | } | 918 | } |
876 | 919 | ||
877 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | 920 | static int snapshot_map(struct dm_target *ti, struct bio *bio, |
@@ -902,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
902 | /* If the block is already remapped - use that, else remap it */ | 945 | /* If the block is already remapped - use that, else remap it */ |
903 | e = lookup_exception(&s->complete, chunk); | 946 | e = lookup_exception(&s->complete, chunk); |
904 | if (e) { | 947 | if (e) { |
905 | remap_exception(s, e, bio); | 948 | remap_exception(s, e, bio, chunk); |
906 | goto out_unlock; | 949 | goto out_unlock; |
907 | } | 950 | } |
908 | 951 | ||
@@ -919,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
919 | goto out_unlock; | 962 | goto out_unlock; |
920 | } | 963 | } |
921 | 964 | ||
922 | remap_exception(s, &pe->e, bio); | 965 | remap_exception(s, &pe->e, bio, chunk); |
923 | bio_list_add(&pe->snapshot_bios, bio); | 966 | bio_list_add(&pe->snapshot_bios, bio); |
924 | 967 | ||
925 | r = DM_MAPIO_SUBMITTED; | 968 | r = DM_MAPIO_SUBMITTED; |
@@ -1207,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, | |||
1207 | 1250 | ||
1208 | static struct target_type origin_target = { | 1251 | static struct target_type origin_target = { |
1209 | .name = "snapshot-origin", | 1252 | .name = "snapshot-origin", |
1210 | .version = {1, 5, 0}, | 1253 | .version = {1, 6, 0}, |
1211 | .module = THIS_MODULE, | 1254 | .module = THIS_MODULE, |
1212 | .ctr = origin_ctr, | 1255 | .ctr = origin_ctr, |
1213 | .dtr = origin_dtr, | 1256 | .dtr = origin_dtr, |
@@ -1218,7 +1261,7 @@ static struct target_type origin_target = { | |||
1218 | 1261 | ||
1219 | static struct target_type snapshot_target = { | 1262 | static struct target_type snapshot_target = { |
1220 | .name = "snapshot", | 1263 | .name = "snapshot", |
1221 | .version = {1, 5, 0}, | 1264 | .version = {1, 6, 0}, |
1222 | .module = THIS_MODULE, | 1265 | .module = THIS_MODULE, |
1223 | .ctr = snapshot_ctr, | 1266 | .ctr = snapshot_ctr, |
1224 | .dtr = snapshot_dtr, | 1267 | .dtr = snapshot_dtr, |
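
The snapshot changes above merge runs of consecutive chunks into a single exception and adjust remap_exception() accordingly. Below is a small standalone sketch of the arithmetic, using the 56/8-bit split documented in dm-snap.h; the sample chunk numbers are invented.

    #include <stdio.h>
    #include <stdint.h>

    #define DM_CHUNK_NUMBER_BITS 56   /* as in dm-snap.h for 64-bit chunk_t */

    struct exception { uint64_t old_chunk, new_chunk; };

    static uint64_t chunk_number(uint64_t c)
    {
        return c & ((1ULL << DM_CHUNK_NUMBER_BITS) - 1);
    }

    static unsigned consecutive(const struct exception *e)
    {
        return (unsigned)(e->new_chunk >> DM_CHUNK_NUMBER_BITS);
    }

    /* lookup_exception() match plus remap_exception() arithmetic in one helper */
    static int remap(const struct exception *e, uint64_t chunk, uint64_t *dest)
    {
        if (chunk < e->old_chunk || chunk > e->old_chunk + consecutive(e))
            return 0;
        *dest = chunk_number(e->new_chunk) + (chunk - e->old_chunk);
        return 1;
    }

    int main(void)
    {
        /* One record covering old chunks 100..103 -> new chunks 7..10. */
        struct exception e = { 100, 7 | (3ULL << DM_CHUNK_NUMBER_BITS) };
        uint64_t probes[] = { 100, 102, 103, 104 }, dest;

        for (unsigned i = 0; i < 4; i++) {
            if (remap(&e, probes[i], &dest))
                printf("old chunk %llu -> new chunk %llu\n",
                       (unsigned long long)probes[i],
                       (unsigned long long)dest);
            else
                printf("old chunk %llu -> not covered by this exception\n",
                       (unsigned long long)probes[i]);
        }
        return 0;
    }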
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h index 650e0f1f51d8..93bce5d49742 100644 --- a/drivers/md/dm-snap.h +++ b/drivers/md/dm-snap.h | |||
@@ -16,19 +16,22 @@ | |||
16 | 16 | ||
17 | struct exception_table { | 17 | struct exception_table { |
18 | uint32_t hash_mask; | 18 | uint32_t hash_mask; |
19 | unsigned hash_shift; | ||
19 | struct list_head *table; | 20 | struct list_head *table; |
20 | }; | 21 | }; |
21 | 22 | ||
22 | /* | 23 | /* |
23 | * The snapshot code deals with largish chunks of the disk at a | 24 | * The snapshot code deals with largish chunks of the disk at a |
24 | * time. Typically 64k - 256k. | 25 | * time. Typically 32k - 512k. |
25 | */ | 26 | */ |
26 | /* FIXME: can we get away with limiting these to a uint32_t ? */ | ||
27 | typedef sector_t chunk_t; | 27 | typedef sector_t chunk_t; |
28 | 28 | ||
29 | /* | 29 | /* |
30 | * An exception is used where an old chunk of data has been | 30 | * An exception is used where an old chunk of data has been |
31 | * replaced by a new one. | 31 | * replaced by a new one. |
32 | * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number | ||
33 | * of chunks that follow contiguously. Remaining bits hold the number of the | ||
34 | * chunk within the device. | ||
32 | */ | 35 | */ |
33 | struct dm_snap_exception { | 36 | struct dm_snap_exception { |
34 | struct list_head hash_list; | 37 | struct list_head hash_list; |
@@ -38,6 +41,49 @@ struct dm_snap_exception { | |||
38 | }; | 41 | }; |
39 | 42 | ||
40 | /* | 43 | /* |
44 | * Functions to manipulate consecutive chunks | ||
45 | */ | ||
46 | # if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) | ||
47 | # define DM_CHUNK_CONSECUTIVE_BITS 8 | ||
48 | # define DM_CHUNK_NUMBER_BITS 56 | ||
49 | |||
50 | static inline chunk_t dm_chunk_number(chunk_t chunk) | ||
51 | { | ||
52 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); | ||
53 | } | ||
54 | |||
55 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | ||
56 | { | ||
57 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; | ||
58 | } | ||
59 | |||
60 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | ||
61 | { | ||
62 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); | ||
63 | |||
64 | BUG_ON(!dm_consecutive_chunk_count(e)); | ||
65 | } | ||
66 | |||
67 | # else | ||
68 | # define DM_CHUNK_CONSECUTIVE_BITS 0 | ||
69 | |||
70 | static inline chunk_t dm_chunk_number(chunk_t chunk) | ||
71 | { | ||
72 | return chunk; | ||
73 | } | ||
74 | |||
75 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | ||
76 | { | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | ||
81 | { | ||
82 | } | ||
83 | |||
84 | # endif | ||
85 | |||
86 | /* | ||
41 | * Abstraction to handle the meta/layout of exception stores (the | 87 | * Abstraction to handle the meta/layout of exception stores (the |
42 | * COW device). | 88 | * COW device). |
43 | */ | 89 | */ |
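
A short sketch of why init_exception_table() gained a hash_shift argument: dropping the low DM_CHUNK_CONSECUTIVE_BITS bits makes consecutive chunks hash to the same bucket, so they can be found and merged cheaply. The bucket count and chunk values below are arbitrary.

    #include <stdio.h>
    #include <stdint.h>

    /* Same shape as exception_hash() after this patch. */
    static uint32_t exception_hash(uint64_t chunk, unsigned hash_shift, uint32_t hash_mask)
    {
        return (uint32_t)(chunk >> hash_shift) & hash_mask;
    }

    int main(void)
    {
        const unsigned shift = 8;         /* DM_CHUNK_CONSECUTIVE_BITS on 64-bit/LBD */
        const uint32_t mask = 1024 - 1;   /* a 1024-bucket table */
        uint64_t chunks[] = { 4096, 4097, 4200, 4352 };

        for (unsigned i = 0; i < 4; i++)
            printf("chunk %llu -> bucket %u\n",
                   (unsigned long long)chunks[i],
                   exception_hash(chunks[i], shift, mask));
        return 0;
    }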
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 969944a8aba2..4de90ab3968b 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -14,10 +14,13 @@ | |||
14 | #include <linux/log2.h> | 14 | #include <linux/log2.h> |
15 | 15 | ||
16 | #define DM_MSG_PREFIX "striped" | 16 | #define DM_MSG_PREFIX "striped" |
17 | #define DM_IO_ERROR_THRESHOLD 15 | ||
17 | 18 | ||
18 | struct stripe { | 19 | struct stripe { |
19 | struct dm_dev *dev; | 20 | struct dm_dev *dev; |
20 | sector_t physical_start; | 21 | sector_t physical_start; |
22 | |||
23 | atomic_t error_count; | ||
21 | }; | 24 | }; |
22 | 25 | ||
23 | struct stripe_c { | 26 | struct stripe_c { |
@@ -30,9 +33,29 @@ struct stripe_c { | |||
30 | uint32_t chunk_shift; | 33 | uint32_t chunk_shift; |
31 | sector_t chunk_mask; | 34 | sector_t chunk_mask; |
32 | 35 | ||
36 | /* Needed for handling events */ | ||
37 | struct dm_target *ti; | ||
38 | |||
39 | /* Work struct used for triggering events */ | ||
40 | struct work_struct kstriped_ws; | ||
41 | |||
33 | struct stripe stripe[0]; | 42 | struct stripe stripe[0]; |
34 | }; | 43 | }; |
35 | 44 | ||
45 | static struct workqueue_struct *kstriped; | ||
46 | |||
47 | /* | ||
48 | * An event is triggered whenever a drive | ||
49 | * drops out of a stripe volume. | ||
50 | */ | ||
51 | static void trigger_event(struct work_struct *work) | ||
52 | { | ||
53 | struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); | ||
54 | |||
55 | dm_table_event(sc->ti->table); | ||
56 | |||
57 | } | ||
58 | |||
36 | static inline struct stripe_c *alloc_context(unsigned int stripes) | 59 | static inline struct stripe_c *alloc_context(unsigned int stripes) |
37 | { | 60 | { |
38 | size_t len; | 61 | size_t len; |
@@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
63 | return -ENXIO; | 86 | return -ENXIO; |
64 | 87 | ||
65 | sc->stripe[stripe].physical_start = start; | 88 | sc->stripe[stripe].physical_start = start; |
89 | |||
66 | return 0; | 90 | return 0; |
67 | } | 91 | } |
68 | 92 | ||
@@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
135 | return -ENOMEM; | 159 | return -ENOMEM; |
136 | } | 160 | } |
137 | 161 | ||
162 | INIT_WORK(&sc->kstriped_ws, trigger_event); | ||
163 | |||
164 | /* Set pointer to dm target; used in trigger_event */ | ||
165 | sc->ti = ti; | ||
166 | |||
138 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
139 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
140 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
@@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
158 | kfree(sc); | 187 | kfree(sc); |
159 | return r; | 188 | return r; |
160 | } | 189 | } |
190 | atomic_set(&(sc->stripe[i].error_count), 0); | ||
161 | } | 191 | } |
162 | 192 | ||
163 | ti->private = sc; | 193 | ti->private = sc; |
194 | |||
164 | return 0; | 195 | return 0; |
165 | } | 196 | } |
166 | 197 | ||
@@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti) | |||
172 | for (i = 0; i < sc->stripes; i++) | 203 | for (i = 0; i < sc->stripes; i++) |
173 | dm_put_device(ti, sc->stripe[i].dev); | 204 | dm_put_device(ti, sc->stripe[i].dev); |
174 | 205 | ||
206 | flush_workqueue(kstriped); | ||
175 | kfree(sc); | 207 | kfree(sc); |
176 | } | 208 | } |
177 | 209 | ||
@@ -190,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
190 | return DM_MAPIO_REMAPPED; | 222 | return DM_MAPIO_REMAPPED; |
191 | } | 223 | } |
192 | 224 | ||
225 | /* | ||
226 | * Stripe status: | ||
227 | * | ||
228 | * INFO | ||
229 | * #stripes [stripe_name <stripe_name>] [group word count] | ||
230 | * [error count 'A|D' <error count 'A|D'>] | ||
231 | * | ||
232 | * TABLE | ||
233 | * #stripes [stripe chunk size] | ||
234 | * [stripe_name physical_start <stripe_name physical_start>] | ||
235 | * | ||
236 | */ | ||
237 | |||
193 | static int stripe_status(struct dm_target *ti, | 238 | static int stripe_status(struct dm_target *ti, |
194 | status_type_t type, char *result, unsigned int maxlen) | 239 | status_type_t type, char *result, unsigned int maxlen) |
195 | { | 240 | { |
196 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 241 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
242 | char buffer[sc->stripes + 1]; | ||
197 | unsigned int sz = 0; | 243 | unsigned int sz = 0; |
198 | unsigned int i; | 244 | unsigned int i; |
199 | 245 | ||
200 | switch (type) { | 246 | switch (type) { |
201 | case STATUSTYPE_INFO: | 247 | case STATUSTYPE_INFO: |
202 | result[0] = '\0'; | 248 | DMEMIT("%d ", sc->stripes); |
249 | for (i = 0; i < sc->stripes; i++) { | ||
250 | DMEMIT("%s ", sc->stripe[i].dev->name); | ||
251 | buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? | ||
252 | 'D' : 'A'; | ||
253 | } | ||
254 | buffer[i] = '\0'; | ||
255 | DMEMIT("1 %s", buffer); | ||
203 | break; | 256 | break; |
204 | 257 | ||
205 | case STATUSTYPE_TABLE: | 258 | case STATUSTYPE_TABLE: |
@@ -213,13 +266,52 @@ static int stripe_status(struct dm_target *ti, | |||
213 | return 0; | 266 | return 0; |
214 | } | 267 | } |
215 | 268 | ||
269 | static int stripe_end_io(struct dm_target *ti, struct bio *bio, | ||
270 | int error, union map_info *map_context) | ||
271 | { | ||
272 | unsigned i; | ||
273 | char major_minor[16]; | ||
274 | struct stripe_c *sc = ti->private; | ||
275 | |||
276 | if (!error) | ||
277 | return 0; /* I/O complete */ | ||
278 | |||
279 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
280 | return error; | ||
281 | |||
282 | if (error == -EOPNOTSUPP) | ||
283 | return error; | ||
284 | |||
285 | memset(major_minor, 0, sizeof(major_minor)); | ||
286 | sprintf(major_minor, "%d:%d", | ||
287 | bio->bi_bdev->bd_disk->major, | ||
288 | bio->bi_bdev->bd_disk->first_minor); | ||
289 | |||
290 | /* | ||
291 | * Test to see which stripe drive triggered the event | ||
292 | * and increment error count for all stripes on that device. | ||
293 | * If the error count for a given device exceeds the threshold | ||
294 | * value we will no longer trigger any further events. | ||
295 | */ | ||
296 | for (i = 0; i < sc->stripes; i++) | ||
297 | if (!strcmp(sc->stripe[i].dev->name, major_minor)) { | ||
298 | atomic_inc(&(sc->stripe[i].error_count)); | ||
299 | if (atomic_read(&(sc->stripe[i].error_count)) < | ||
300 | DM_IO_ERROR_THRESHOLD) | ||
301 | queue_work(kstriped, &sc->kstriped_ws); | ||
302 | } | ||
303 | |||
304 | return error; | ||
305 | } | ||
306 | |||
216 | static struct target_type stripe_target = { | 307 | static struct target_type stripe_target = { |
217 | .name = "striped", | 308 | .name = "striped", |
218 | .version= {1, 0, 2}, | 309 | .version = {1, 1, 0}, |
219 | .module = THIS_MODULE, | 310 | .module = THIS_MODULE, |
220 | .ctr = stripe_ctr, | 311 | .ctr = stripe_ctr, |
221 | .dtr = stripe_dtr, | 312 | .dtr = stripe_dtr, |
222 | .map = stripe_map, | 313 | .map = stripe_map, |
314 | .end_io = stripe_end_io, | ||
223 | .status = stripe_status, | 315 | .status = stripe_status, |
224 | }; | 316 | }; |
225 | 317 | ||
@@ -231,6 +323,13 @@ int __init dm_stripe_init(void) | |||
231 | if (r < 0) | 323 | if (r < 0) |
232 | DMWARN("target registration failed"); | 324 | DMWARN("target registration failed"); |
233 | 325 | ||
326 | kstriped = create_singlethread_workqueue("kstriped"); | ||
327 | if (!kstriped) { | ||
328 | DMERR("failed to create workqueue kstriped"); | ||
329 | dm_unregister_target(&stripe_target); | ||
330 | return -ENOMEM; | ||
331 | } | ||
332 | |||
234 | return r; | 333 | return r; |
235 | } | 334 | } |
236 | 335 | ||
@@ -239,5 +338,7 @@ void dm_stripe_exit(void) | |||
239 | if (dm_unregister_target(&stripe_target)) | 338 | if (dm_unregister_target(&stripe_target)) |
240 | DMWARN("target unregistration failed"); | 339 | DMWARN("target unregistration failed"); |
241 | 340 | ||
341 | destroy_workqueue(kstriped); | ||
342 | |||
242 | return; | 343 | return; |
243 | } | 344 | } |
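
To show the intent of stripe_end_io() above, here is a userspace approximation of its device matching and event throttling: format the failing device as "major:minor", bump the matching leg's error count, and keep raising events only while the count stays below DM_IO_ERROR_THRESHOLD. All device names and numbers here are invented.

    #include <stdio.h>
    #include <string.h>

    #define DM_IO_ERROR_THRESHOLD 15

    struct stripe_sim {
        char name[16];     /* dm_dev-style "major:minor" */
        int error_count;
    };

    /* Returns 1 if an event should still be raised for this failure. */
    static int note_failure(struct stripe_sim *s, unsigned n, int major, int minor)
    {
        char major_minor[16];
        unsigned i;
        int raise = 0;

        snprintf(major_minor, sizeof(major_minor), "%d:%d", major, minor);

        for (i = 0; i < n; i++)
            if (!strcmp(s[i].name, major_minor)) {
                s[i].error_count++;
                if (s[i].error_count < DM_IO_ERROR_THRESHOLD)
                    raise = 1;
            }
        return raise;
    }

    int main(void)
    {
        struct stripe_sim legs[2] = { { "8:16", 0 }, { "8:32", 0 } };
        int i;

        for (i = 0; i < 20; i++)
            if (note_failure(legs, 2, 8, 32))
                printf("failure %d on 8:32 -> trigger event\n", i + 1);
        printf("8:32 error_count = %d (events stop at the threshold)\n",
               legs[1].error_count);
        return 0;
    }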
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 47818d8249cb..f16062982383 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices) | |||
287 | { | 287 | { |
288 | struct list_head *tmp, *next; | 288 | struct list_head *tmp, *next; |
289 | 289 | ||
290 | for (tmp = devices->next; tmp != devices; tmp = next) { | 290 | list_for_each_safe(tmp, next, devices) { |
291 | struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); | 291 | struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); |
292 | next = tmp->next; | ||
293 | kfree(dd); | 292 | kfree(dd); |
294 | } | 293 | } |
295 | } | 294 | } |
@@ -476,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
476 | int mode, struct dm_dev **result) | 475 | int mode, struct dm_dev **result) |
477 | { | 476 | { |
478 | int r; | 477 | int r; |
479 | dev_t dev; | 478 | dev_t uninitialized_var(dev); |
480 | struct dm_dev *dd; | 479 | struct dm_dev *dd; |
481 | unsigned int major, minor; | 480 | unsigned int major, minor; |
482 | 481 | ||
@@ -805,7 +804,7 @@ static int setup_indexes(struct dm_table *t) | |||
805 | return -ENOMEM; | 804 | return -ENOMEM; |
806 | 805 | ||
807 | /* set up internal nodes, bottom-up */ | 806 | /* set up internal nodes, bottom-up */ |
808 | for (i = t->depth - 2, total = 0; i >= 0; i--) { | 807 | for (i = t->depth - 2; i >= 0; i--) { |
809 | t->index[i] = indexes; | 808 | t->index[i] = indexes; |
810 | indexes += (KEYS_PER_NODE * t->counts[i]); | 809 | indexes += (KEYS_PER_NODE * t->counts[i]); |
811 | setup_btree_index(i, t); | 810 | setup_btree_index(i, t); |
@@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t) | |||
993 | 992 | ||
994 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) | 993 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) |
995 | { | 994 | { |
996 | struct list_head *d, *devices; | 995 | struct dm_dev *dd; |
996 | struct list_head *devices = dm_table_get_devices(t); | ||
997 | int r = 0; | 997 | int r = 0; |
998 | 998 | ||
999 | devices = dm_table_get_devices(t); | 999 | list_for_each_entry(dd, devices, list) { |
1000 | for (d = devices->next; d != devices; d = d->next) { | ||
1001 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
1002 | struct request_queue *q = bdev_get_queue(dd->bdev); | 1000 | struct request_queue *q = bdev_get_queue(dd->bdev); |
1003 | r |= bdi_congested(&q->backing_dev_info, bdi_bits); | 1001 | r |= bdi_congested(&q->backing_dev_info, bdi_bits); |
1004 | } | 1002 | } |
@@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
1008 | 1006 | ||
1009 | void dm_table_unplug_all(struct dm_table *t) | 1007 | void dm_table_unplug_all(struct dm_table *t) |
1010 | { | 1008 | { |
1011 | struct list_head *d, *devices = dm_table_get_devices(t); | 1009 | struct dm_dev *dd; |
1010 | struct list_head *devices = dm_table_get_devices(t); | ||
1012 | 1011 | ||
1013 | for (d = devices->next; d != devices; d = d->next) { | 1012 | list_for_each_entry(dd, devices, list) { |
1014 | struct dm_dev *dd = list_entry(d, struct dm_dev, list); | ||
1015 | struct request_queue *q = bdev_get_queue(dd->bdev); | 1013 | struct request_queue *q = bdev_get_queue(dd->bdev); |
1016 | 1014 | ||
1017 | blk_unplug(q); | 1015 | blk_unplug(q); |
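The dm-table.c changes replace the open-coded list walks ("for (tmp = devices->next; tmp != devices; ...)") with the standard <linux/list.h> iterators: list_for_each_safe() where entries are freed during the walk, and list_for_each_entry() where they are only read, which also drops the manual list_entry() casts. The idioms, roughly, with illustrative types and using the *_entry variants for brevity:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
	struct list_head list;
	int value;
};

/* Read-only traversal: the cursor variable is already the entry type. */
static int sum(struct list_head *head)
{
	struct item *it;
	int total = 0;

	list_for_each_entry(it, head, list)
		total += it->value;

	return total;
}

/* A traversal that frees nodes must use a _safe variant, which caches
 * the next pointer before the current entry is released. */
static void release_all(struct list_head *head)
{
	struct item *it, *next;

	list_for_each_entry_safe(it, next, head, list) {
		list_del(&it->list);
		kfree(it);
	}
}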
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f2d24eb3208c..6617ce4af095 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -71,9 +71,22 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
71 | #define DMF_DELETING 4 | 71 | #define DMF_DELETING 4 |
72 | #define DMF_NOFLUSH_SUSPENDING 5 | 72 | #define DMF_NOFLUSH_SUSPENDING 5 |
73 | 73 | ||
74 | /* | ||
75 | * Work processed by per-device workqueue. | ||
76 | */ | ||
77 | struct dm_wq_req { | ||
78 | enum { | ||
79 | DM_WQ_FLUSH_ALL, | ||
80 | DM_WQ_FLUSH_DEFERRED, | ||
81 | } type; | ||
82 | struct work_struct work; | ||
83 | struct mapped_device *md; | ||
84 | void *context; | ||
85 | }; | ||
86 | |||
74 | struct mapped_device { | 87 | struct mapped_device { |
75 | struct rw_semaphore io_lock; | 88 | struct rw_semaphore io_lock; |
76 | struct semaphore suspend_lock; | 89 | struct mutex suspend_lock; |
77 | spinlock_t pushback_lock; | 90 | spinlock_t pushback_lock; |
78 | rwlock_t map_lock; | 91 | rwlock_t map_lock; |
79 | atomic_t holders; | 92 | atomic_t holders; |
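Two things change in this first dm.c hunk: the per-device work item type (struct dm_wq_req, used by the deferred-flush rework further down) is introduced, and md->suspend_lock is converted from a struct semaphore used as a binary mutex to a real struct mutex, which gains lockdep checking and stricter owner semantics. The conversion itself is mechanical; roughly:

#include <linux/mutex.h>

static DEFINE_MUTEX(suspend_lock);	/* or mutex_init() on an embedded field,
					 * replacing init_MUTEX() */

static void guarded_section(void)
{
	mutex_lock(&suspend_lock);	/* was down(&suspend_lock) */
	/* ... suspend / resume / table-swap work ... */
	mutex_unlock(&suspend_lock);	/* was up(&suspend_lock) */
}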
@@ -96,6 +109,11 @@ struct mapped_device { | |||
96 | struct bio_list pushback; | 109 | struct bio_list pushback; |
97 | 110 | ||
98 | /* | 111 | /* |
112 | * Processing queue (flush/barriers) | ||
113 | */ | ||
114 | struct workqueue_struct *wq; | ||
115 | |||
116 | /* | ||
99 | * The current mapping. | 117 | * The current mapping. |
100 | */ | 118 | */ |
101 | struct dm_table *map; | 119 | struct dm_table *map; |
@@ -181,7 +199,7 @@ static void local_exit(void) | |||
181 | DMINFO("cleaned up"); | 199 | DMINFO("cleaned up"); |
182 | } | 200 | } |
183 | 201 | ||
184 | int (*_inits[])(void) __initdata = { | 202 | static int (*_inits[])(void) __initdata = { |
185 | local_init, | 203 | local_init, |
186 | dm_target_init, | 204 | dm_target_init, |
187 | dm_linear_init, | 205 | dm_linear_init, |
@@ -189,7 +207,7 @@ int (*_inits[])(void) __initdata = { | |||
189 | dm_interface_init, | 207 | dm_interface_init, |
190 | }; | 208 | }; |
191 | 209 | ||
192 | void (*_exits[])(void) = { | 210 | static void (*_exits[])(void) = { |
193 | local_exit, | 211 | local_exit, |
194 | dm_target_exit, | 212 | dm_target_exit, |
195 | dm_linear_exit, | 213 | dm_linear_exit, |
@@ -982,7 +1000,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
982 | } | 1000 | } |
983 | 1001 | ||
984 | if (!try_module_get(THIS_MODULE)) | 1002 | if (!try_module_get(THIS_MODULE)) |
985 | goto bad0; | 1003 | goto bad_module_get; |
986 | 1004 | ||
987 | /* get a minor number for the dev */ | 1005 | /* get a minor number for the dev */ |
988 | if (minor == DM_ANY_MINOR) | 1006 | if (minor == DM_ANY_MINOR) |
@@ -990,11 +1008,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
990 | else | 1008 | else |
991 | r = specific_minor(md, minor); | 1009 | r = specific_minor(md, minor); |
992 | if (r < 0) | 1010 | if (r < 0) |
993 | goto bad1; | 1011 | goto bad_minor; |
994 | 1012 | ||
995 | memset(md, 0, sizeof(*md)); | 1013 | memset(md, 0, sizeof(*md)); |
996 | init_rwsem(&md->io_lock); | 1014 | init_rwsem(&md->io_lock); |
997 | init_MUTEX(&md->suspend_lock); | 1015 | mutex_init(&md->suspend_lock); |
998 | spin_lock_init(&md->pushback_lock); | 1016 | spin_lock_init(&md->pushback_lock); |
999 | rwlock_init(&md->map_lock); | 1017 | rwlock_init(&md->map_lock); |
1000 | atomic_set(&md->holders, 1); | 1018 | atomic_set(&md->holders, 1); |
@@ -1006,7 +1024,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1006 | 1024 | ||
1007 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1025 | md->queue = blk_alloc_queue(GFP_KERNEL); |
1008 | if (!md->queue) | 1026 | if (!md->queue) |
1009 | goto bad1_free_minor; | 1027 | goto bad_queue; |
1010 | 1028 | ||
1011 | md->queue->queuedata = md; | 1029 | md->queue->queuedata = md; |
1012 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1030 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
@@ -1017,11 +1035,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
1017 | 1035 | ||
1018 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1036 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); |
1019 | if (!md->io_pool) | 1037 | if (!md->io_pool) |
1020 | goto bad2; | 1038 | goto bad_io_pool; |
1021 | 1039 | ||
1022 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | 1040 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); |
1023 | if (!md->tio_pool) | 1041 | if (!md->tio_pool) |
1024 | goto bad3; | 1042 | goto bad_tio_pool; |
1025 | 1043 | ||
1026 | md->bs = bioset_create(16, 16); | 1044 | md->bs = bioset_create(16, 16); |
1027 | if (!md->bs) | 1045 | if (!md->bs) |
@@ -1029,7 +1047,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1029 | 1047 | ||
1030 | md->disk = alloc_disk(1); | 1048 | md->disk = alloc_disk(1); |
1031 | if (!md->disk) | 1049 | if (!md->disk) |
1032 | goto bad4; | 1050 | goto bad_disk; |
1033 | 1051 | ||
1034 | atomic_set(&md->pending, 0); | 1052 | atomic_set(&md->pending, 0); |
1035 | init_waitqueue_head(&md->wait); | 1053 | init_waitqueue_head(&md->wait); |
@@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
1044 | add_disk(md->disk); | 1062 | add_disk(md->disk); |
1045 | format_dev_t(md->name, MKDEV(_major, minor)); | 1063 | format_dev_t(md->name, MKDEV(_major, minor)); |
1046 | 1064 | ||
1065 | md->wq = create_singlethread_workqueue("kdmflush"); | ||
1066 | if (!md->wq) | ||
1067 | goto bad_thread; | ||
1068 | |||
1047 | /* Populate the mapping, nobody knows we exist yet */ | 1069 | /* Populate the mapping, nobody knows we exist yet */ |
1048 | spin_lock(&_minor_lock); | 1070 | spin_lock(&_minor_lock); |
1049 | old_md = idr_replace(&_minor_idr, md, minor); | 1071 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -1053,19 +1075,21 @@ static struct mapped_device *alloc_dev(int minor) | |||
1053 | 1075 | ||
1054 | return md; | 1076 | return md; |
1055 | 1077 | ||
1056 | bad4: | 1078 | bad_thread: |
1079 | put_disk(md->disk); | ||
1080 | bad_disk: | ||
1057 | bioset_free(md->bs); | 1081 | bioset_free(md->bs); |
1058 | bad_no_bioset: | 1082 | bad_no_bioset: |
1059 | mempool_destroy(md->tio_pool); | 1083 | mempool_destroy(md->tio_pool); |
1060 | bad3: | 1084 | bad_tio_pool: |
1061 | mempool_destroy(md->io_pool); | 1085 | mempool_destroy(md->io_pool); |
1062 | bad2: | 1086 | bad_io_pool: |
1063 | blk_cleanup_queue(md->queue); | 1087 | blk_cleanup_queue(md->queue); |
1064 | bad1_free_minor: | 1088 | bad_queue: |
1065 | free_minor(minor); | 1089 | free_minor(minor); |
1066 | bad1: | 1090 | bad_minor: |
1067 | module_put(THIS_MODULE); | 1091 | module_put(THIS_MODULE); |
1068 | bad0: | 1092 | bad_module_get: |
1069 | kfree(md); | 1093 | kfree(md); |
1070 | return NULL; | 1094 | return NULL; |
1071 | } | 1095 | } |
@@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md) | |||
1080 | unlock_fs(md); | 1104 | unlock_fs(md); |
1081 | bdput(md->suspended_bdev); | 1105 | bdput(md->suspended_bdev); |
1082 | } | 1106 | } |
1107 | destroy_workqueue(md->wq); | ||
1083 | mempool_destroy(md->tio_pool); | 1108 | mempool_destroy(md->tio_pool); |
1084 | mempool_destroy(md->io_pool); | 1109 | mempool_destroy(md->io_pool); |
1085 | bioset_free(md->bs); | 1110 | bioset_free(md->bs); |
@@ -1259,20 +1284,91 @@ void dm_put(struct mapped_device *md) | |||
1259 | } | 1284 | } |
1260 | EXPORT_SYMBOL_GPL(dm_put); | 1285 | EXPORT_SYMBOL_GPL(dm_put); |
1261 | 1286 | ||
1287 | static int dm_wait_for_completion(struct mapped_device *md) | ||
1288 | { | ||
1289 | int r = 0; | ||
1290 | |||
1291 | while (1) { | ||
1292 | set_current_state(TASK_INTERRUPTIBLE); | ||
1293 | |||
1294 | smp_mb(); | ||
1295 | if (!atomic_read(&md->pending)) | ||
1296 | break; | ||
1297 | |||
1298 | if (signal_pending(current)) { | ||
1299 | r = -EINTR; | ||
1300 | break; | ||
1301 | } | ||
1302 | |||
1303 | io_schedule(); | ||
1304 | } | ||
1305 | set_current_state(TASK_RUNNING); | ||
1306 | |||
1307 | return r; | ||
1308 | } | ||
1309 | |||
1262 | /* | 1310 | /* |
1263 | * Process the deferred bios | 1311 | * Process the deferred bios |
1264 | */ | 1312 | */ |
1265 | static void __flush_deferred_io(struct mapped_device *md, struct bio *c) | 1313 | static void __flush_deferred_io(struct mapped_device *md) |
1266 | { | 1314 | { |
1267 | struct bio *n; | 1315 | struct bio *c; |
1268 | 1316 | ||
1269 | while (c) { | 1317 | while ((c = bio_list_pop(&md->deferred))) { |
1270 | n = c->bi_next; | ||
1271 | c->bi_next = NULL; | ||
1272 | if (__split_bio(md, c)) | 1318 | if (__split_bio(md, c)) |
1273 | bio_io_error(c); | 1319 | bio_io_error(c); |
1274 | c = n; | ||
1275 | } | 1320 | } |
1321 | |||
1322 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1323 | } | ||
1324 | |||
1325 | static void __merge_pushback_list(struct mapped_device *md) | ||
1326 | { | ||
1327 | unsigned long flags; | ||
1328 | |||
1329 | spin_lock_irqsave(&md->pushback_lock, flags); | ||
1330 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | ||
1331 | bio_list_merge_head(&md->deferred, &md->pushback); | ||
1332 | bio_list_init(&md->pushback); | ||
1333 | spin_unlock_irqrestore(&md->pushback_lock, flags); | ||
1334 | } | ||
1335 | |||
1336 | static void dm_wq_work(struct work_struct *work) | ||
1337 | { | ||
1338 | struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); | ||
1339 | struct mapped_device *md = req->md; | ||
1340 | |||
1341 | down_write(&md->io_lock); | ||
1342 | switch (req->type) { | ||
1343 | case DM_WQ_FLUSH_ALL: | ||
1344 | __merge_pushback_list(md); | ||
1345 | /* pass through */ | ||
1346 | case DM_WQ_FLUSH_DEFERRED: | ||
1347 | __flush_deferred_io(md); | ||
1348 | break; | ||
1349 | default: | ||
1350 | DMERR("dm_wq_work: unrecognised work type %d", req->type); | ||
1351 | BUG(); | ||
1352 | } | ||
1353 | up_write(&md->io_lock); | ||
1354 | } | ||
1355 | |||
1356 | static void dm_wq_queue(struct mapped_device *md, int type, void *context, | ||
1357 | struct dm_wq_req *req) | ||
1358 | { | ||
1359 | req->type = type; | ||
1360 | req->md = md; | ||
1361 | req->context = context; | ||
1362 | INIT_WORK(&req->work, dm_wq_work); | ||
1363 | queue_work(md->wq, &req->work); | ||
1364 | } | ||
1365 | |||
1366 | static void dm_queue_flush(struct mapped_device *md, int type, void *context) | ||
1367 | { | ||
1368 | struct dm_wq_req req; | ||
1369 | |||
1370 | dm_wq_queue(md, type, context, &req); | ||
1371 | flush_workqueue(md->wq); | ||
1276 | } | 1372 | } |
1277 | 1373 | ||
1278 | /* | 1374 | /* |
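This hunk is the core of "dm: move deferred bio flushing to workqueue": __flush_deferred_io() now pops bios straight off md->deferred, and callers hand the flush to the per-device kdmflush queue through an on-stack dm_wq_req, then call flush_workqueue() so the request cannot go out of scope before the worker has run; the singlethreaded queue also serialises flushes per device. A stripped-down sketch of that synchronous, on-stack work pattern (names are illustrative):

#include <linux/workqueue.h>

struct flush_req {
	int type;
	struct work_struct work;
	void *owner;			/* e.g. the mapped device */
};

static void flush_fn(struct work_struct *work)
{
	struct flush_req *req = container_of(work, struct flush_req, work);

	/* ... take the owner's locks and drain its deferred list ... */
	(void)req;
}

/* Queue a request that lives on the caller's stack and wait for it:
 * flush_workqueue() returns only after every item queued before the
 * call, including this one, has been processed. */
static void flush_sync(struct workqueue_struct *wq, void *owner, int type)
{
	struct flush_req req;

	req.type = type;
	req.owner = owner;
	INIT_WORK(&req.work, flush_fn);
	queue_work(wq, &req.work);
	flush_workqueue(wq);
}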
@@ -1282,7 +1378,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
1282 | { | 1378 | { |
1283 | int r = -EINVAL; | 1379 | int r = -EINVAL; |
1284 | 1380 | ||
1285 | down(&md->suspend_lock); | 1381 | mutex_lock(&md->suspend_lock); |
1286 | 1382 | ||
1287 | /* device must be suspended */ | 1383 | /* device must be suspended */ |
1288 | if (!dm_suspended(md)) | 1384 | if (!dm_suspended(md)) |
@@ -1297,7 +1393,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
1297 | r = __bind(md, table); | 1393 | r = __bind(md, table); |
1298 | 1394 | ||
1299 | out: | 1395 | out: |
1300 | up(&md->suspend_lock); | 1396 | mutex_unlock(&md->suspend_lock); |
1301 | return r; | 1397 | return r; |
1302 | } | 1398 | } |
1303 | 1399 | ||
@@ -1346,17 +1442,17 @@ static void unlock_fs(struct mapped_device *md) | |||
1346 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 1442 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
1347 | { | 1443 | { |
1348 | struct dm_table *map = NULL; | 1444 | struct dm_table *map = NULL; |
1349 | unsigned long flags; | ||
1350 | DECLARE_WAITQUEUE(wait, current); | 1445 | DECLARE_WAITQUEUE(wait, current); |
1351 | struct bio *def; | 1446 | int r = 0; |
1352 | int r = -EINVAL; | ||
1353 | int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; | 1447 | int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; |
1354 | int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; | 1448 | int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; |
1355 | 1449 | ||
1356 | down(&md->suspend_lock); | 1450 | mutex_lock(&md->suspend_lock); |
1357 | 1451 | ||
1358 | if (dm_suspended(md)) | 1452 | if (dm_suspended(md)) { |
1453 | r = -EINVAL; | ||
1359 | goto out_unlock; | 1454 | goto out_unlock; |
1455 | } | ||
1360 | 1456 | ||
1361 | map = dm_get_table(md); | 1457 | map = dm_get_table(md); |
1362 | 1458 | ||
@@ -1378,16 +1474,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1378 | r = -ENOMEM; | 1474 | r = -ENOMEM; |
1379 | goto flush_and_out; | 1475 | goto flush_and_out; |
1380 | } | 1476 | } |
1381 | } | ||
1382 | 1477 | ||
1383 | /* | 1478 | /* |
1384 | * Flush I/O to the device. | 1479 | * Flush I/O to the device. noflush supersedes do_lockfs, |
1385 | * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. | 1480 | * because lock_fs() needs to flush I/Os. |
1386 | */ | 1481 | */ |
1387 | if (do_lockfs && !noflush) { | 1482 | if (do_lockfs) { |
1388 | r = lock_fs(md); | 1483 | r = lock_fs(md); |
1389 | if (r) | 1484 | if (r) |
1390 | goto out; | 1485 | goto out; |
1486 | } | ||
1391 | } | 1487 | } |
1392 | 1488 | ||
1393 | /* | 1489 | /* |
@@ -1404,66 +1500,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
1404 | dm_table_unplug_all(map); | 1500 | dm_table_unplug_all(map); |
1405 | 1501 | ||
1406 | /* | 1502 | /* |
1407 | * Then we wait for the already mapped ios to | 1503 | * Wait for the already-mapped ios to complete. |
1408 | * complete. | ||
1409 | */ | 1504 | */ |
1410 | while (1) { | 1505 | r = dm_wait_for_completion(md); |
1411 | set_current_state(TASK_INTERRUPTIBLE); | ||
1412 | |||
1413 | if (!atomic_read(&md->pending) || signal_pending(current)) | ||
1414 | break; | ||
1415 | |||
1416 | io_schedule(); | ||
1417 | } | ||
1418 | set_current_state(TASK_RUNNING); | ||
1419 | 1506 | ||
1420 | down_write(&md->io_lock); | 1507 | down_write(&md->io_lock); |
1421 | remove_wait_queue(&md->wait, &wait); | 1508 | remove_wait_queue(&md->wait, &wait); |
1422 | 1509 | ||
1423 | if (noflush) { | 1510 | if (noflush) |
1424 | spin_lock_irqsave(&md->pushback_lock, flags); | 1511 | __merge_pushback_list(md); |
1425 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 1512 | up_write(&md->io_lock); |
1426 | bio_list_merge_head(&md->deferred, &md->pushback); | ||
1427 | bio_list_init(&md->pushback); | ||
1428 | spin_unlock_irqrestore(&md->pushback_lock, flags); | ||
1429 | } | ||
1430 | 1513 | ||
1431 | /* were we interrupted ? */ | 1514 | /* were we interrupted ? */ |
1432 | r = -EINTR; | 1515 | if (r < 0) { |
1433 | if (atomic_read(&md->pending)) { | 1516 | dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); |
1434 | clear_bit(DMF_BLOCK_IO, &md->flags); | 1517 | |
1435 | def = bio_list_get(&md->deferred); | ||
1436 | __flush_deferred_io(md, def); | ||
1437 | up_write(&md->io_lock); | ||
1438 | unlock_fs(md); | 1518 | unlock_fs(md); |
1439 | goto out; /* pushback list is already flushed, so skip flush */ | 1519 | goto out; /* pushback list is already flushed, so skip flush */ |
1440 | } | 1520 | } |
1441 | up_write(&md->io_lock); | ||
1442 | 1521 | ||
1443 | dm_table_postsuspend_targets(map); | 1522 | dm_table_postsuspend_targets(map); |
1444 | 1523 | ||
1445 | set_bit(DMF_SUSPENDED, &md->flags); | 1524 | set_bit(DMF_SUSPENDED, &md->flags); |
1446 | 1525 | ||
1447 | r = 0; | ||
1448 | |||
1449 | flush_and_out: | 1526 | flush_and_out: |
1450 | if (r && noflush) { | 1527 | if (r && noflush) |
1451 | /* | 1528 | /* |
1452 | * Because there may be already I/Os in the pushback list, | 1529 | * Because there may be already I/Os in the pushback list, |
1453 | * flush them before return. | 1530 | * flush them before return. |
1454 | */ | 1531 | */ |
1455 | down_write(&md->io_lock); | 1532 | dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL); |
1456 | |||
1457 | spin_lock_irqsave(&md->pushback_lock, flags); | ||
1458 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | ||
1459 | bio_list_merge_head(&md->deferred, &md->pushback); | ||
1460 | bio_list_init(&md->pushback); | ||
1461 | spin_unlock_irqrestore(&md->pushback_lock, flags); | ||
1462 | |||
1463 | def = bio_list_get(&md->deferred); | ||
1464 | __flush_deferred_io(md, def); | ||
1465 | up_write(&md->io_lock); | ||
1466 | } | ||
1467 | 1533 | ||
1468 | out: | 1534 | out: |
1469 | if (r && md->suspended_bdev) { | 1535 | if (r && md->suspended_bdev) { |
@@ -1474,17 +1540,16 @@ out: | |||
1474 | dm_table_put(map); | 1540 | dm_table_put(map); |
1475 | 1541 | ||
1476 | out_unlock: | 1542 | out_unlock: |
1477 | up(&md->suspend_lock); | 1543 | mutex_unlock(&md->suspend_lock); |
1478 | return r; | 1544 | return r; |
1479 | } | 1545 | } |
1480 | 1546 | ||
1481 | int dm_resume(struct mapped_device *md) | 1547 | int dm_resume(struct mapped_device *md) |
1482 | { | 1548 | { |
1483 | int r = -EINVAL; | 1549 | int r = -EINVAL; |
1484 | struct bio *def; | ||
1485 | struct dm_table *map = NULL; | 1550 | struct dm_table *map = NULL; |
1486 | 1551 | ||
1487 | down(&md->suspend_lock); | 1552 | mutex_lock(&md->suspend_lock); |
1488 | if (!dm_suspended(md)) | 1553 | if (!dm_suspended(md)) |
1489 | goto out; | 1554 | goto out; |
1490 | 1555 | ||
@@ -1496,12 +1561,7 @@ int dm_resume(struct mapped_device *md) | |||
1496 | if (r) | 1561 | if (r) |
1497 | goto out; | 1562 | goto out; |
1498 | 1563 | ||
1499 | down_write(&md->io_lock); | 1564 | dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); |
1500 | clear_bit(DMF_BLOCK_IO, &md->flags); | ||
1501 | |||
1502 | def = bio_list_get(&md->deferred); | ||
1503 | __flush_deferred_io(md, def); | ||
1504 | up_write(&md->io_lock); | ||
1505 | 1565 | ||
1506 | unlock_fs(md); | 1566 | unlock_fs(md); |
1507 | 1567 | ||
@@ -1520,7 +1580,7 @@ int dm_resume(struct mapped_device *md) | |||
1520 | 1580 | ||
1521 | out: | 1581 | out: |
1522 | dm_table_put(map); | 1582 | dm_table_put(map); |
1523 | up(&md->suspend_lock); | 1583 | mutex_unlock(&md->suspend_lock); |
1524 | 1584 | ||
1525 | return r; | 1585 | return r; |
1526 | } | 1586 | } |
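The remaining dm.c hunks convert the rest of dm_suspend()/dm_resume() to the new helpers: the inline wait loop becomes dm_wait_for_completion(), whose return value replaces the old re-check of md->pending, the pushback/deferred replay collapses into dm_queue_flush() with DM_WQ_FLUSH_DEFERRED or DM_WQ_FLUSH_ALL, and suspend_lock is now taken with mutex_lock()/mutex_unlock(). The wait stays open-coded rather than using the generic helper because it wants io_schedule() for iowait accounting and a plain -EINTR; the generic form would be roughly the sketch below (hypothetical wrapper; note that wait_event_interruptible() reports a signal as -ERESTARTSYS and sleeps via schedule()):

#include <linux/wait.h>
#include <linux/atomic.h>

/* Hypothetical shorthand for dm_wait_for_completion(): assumes the I/O
 * completion path does wake_up() on 'wq' whenever it decrements 'pending'. */
static int wait_for_pending(wait_queue_head_t *wq, atomic_t *pending)
{
	return wait_event_interruptible(*wq, !atomic_read(pending));
}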