author    Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 22:30:50 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 22:30:50 -0500
commit    a4ffc0a0b240a29cbe489f6db9dae112a49ef1c1 (patch)
tree      9719c706444f4b720aff2bb4bdf23a4be3f4b1e3 /drivers
parent    d7511ec8115487ccea2ce93bf58d5e5cd2c1c0a3 (diff)
parent    af195ac82e38ba802fd86b5a014ed05ef6dd88bb (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (44 commits)
  dm raid1: report fault status
  dm raid1: handle read failures
  dm raid1: fix EIO after log failure
  dm raid1: handle recovery failures
  dm raid1: handle write failures
  dm snapshot: combine consecutive exceptions in memory
  dm: stripe enhanced status return
  dm: stripe trigger event on failure
  dm log: auto load modules
  dm: move deferred bio flushing to workqueue
  dm crypt: use async crypto
  dm crypt: prepare async callback fn
  dm crypt: add completion for async
  dm crypt: add async request mempool
  dm crypt: extract scatterlist processing
  dm crypt: tidy io ref counting
  dm crypt: introduce crypt_write_io_loop
  dm crypt: abstract crypt_write_done
  dm crypt: store sector mapping in dm_crypt_io
  dm crypt: move queue functions
  ...
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig               |  24
-rw-r--r--  drivers/md/dm-crypt.c            | 486
-rw-r--r--  drivers/md/dm-exception-store.c  |   2
-rw-r--r--  drivers/md/dm-ioctl.c            |  32
-rw-r--r--  drivers/md/dm-log.c              |  51
-rw-r--r--  drivers/md/dm-mpath.c            |   2
-rw-r--r--  drivers/md/dm-raid1.c            | 664
-rw-r--r--  drivers/md/dm-snap.c             |  95
-rw-r--r--  drivers/md/dm-snap.h             |  50
-rw-r--r--  drivers/md/dm-stripe.c           | 105
-rw-r--r--  drivers/md/dm-table.c            |  20
-rw-r--r--  drivers/md/dm.c                  | 238
12 files changed, 1361 insertions, 408 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3fa7c77d9bd9..610af916891e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -204,7 +204,7 @@ config BLK_DEV_DM
 
 config DM_DEBUG
         boolean "Device mapper debugging support"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        depends on BLK_DEV_DM
         ---help---
           Enable this for messages that may help debug device-mapper problems.
 
@@ -212,7 +212,7 @@ config DM_DEBUG
 
 config DM_CRYPT
         tristate "Crypt target support"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        depends on BLK_DEV_DM
         select CRYPTO
         select CRYPTO_CBC
         ---help---
@@ -230,34 +230,34 @@ config DM_CRYPT
           If unsure, say N.
 
 config DM_SNAPSHOT
-        tristate "Snapshot target (EXPERIMENTAL)"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        tristate "Snapshot target"
+        depends on BLK_DEV_DM
         ---help---
           Allow volume managers to take writable snapshots of a device.
 
 config DM_MIRROR
-        tristate "Mirror target (EXPERIMENTAL)"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        tristate "Mirror target"
+        depends on BLK_DEV_DM
         ---help---
           Allow volume managers to mirror logical volumes, also
           needed for live data migration tools such as 'pvmove'.
 
 config DM_ZERO
-        tristate "Zero target (EXPERIMENTAL)"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        tristate "Zero target"
+        depends on BLK_DEV_DM
         ---help---
           A target that discards writes, and returns all zeroes for
           reads. Useful in some recovery situations.
 
 config DM_MULTIPATH
-        tristate "Multipath target (EXPERIMENTAL)"
-        depends on BLK_DEV_DM && EXPERIMENTAL
+        tristate "Multipath target"
+        depends on BLK_DEV_DM
         ---help---
           Allow volume managers to support multipath hardware.
 
 config DM_MULTIPATH_EMC
-        tristate "EMC CX/AX multipath support (EXPERIMENTAL)"
-        depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL
+        tristate "EMC CX/AX multipath support"
+        depends on DM_MULTIPATH && BLK_DEV_DM
         ---help---
           Multipath support for EMC CX/AX series hardware.
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6b66ee46b87d..b04f98df94ea 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,11 +1,12 @@
1/* 1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This file is released under the GPL. 6 * This file is released under the GPL.
7 */ 7 */
8 8
9#include <linux/completion.h>
9#include <linux/err.h> 10#include <linux/err.h>
10#include <linux/module.h> 11#include <linux/module.h>
11#include <linux/init.h> 12#include <linux/init.h>
@@ -28,20 +29,10 @@
28#define MESG_STR(x) x, sizeof(x) 29#define MESG_STR(x) x, sizeof(x)
29 30
30/* 31/*
31 * per bio private data
32 */
33struct dm_crypt_io {
34 struct dm_target *target;
35 struct bio *base_bio;
36 struct work_struct work;
37 atomic_t pending;
38 int error;
39};
40
41/*
42 * context holding the current state of a multi-part conversion 32 * context holding the current state of a multi-part conversion
43 */ 33 */
44struct convert_context { 34struct convert_context {
35 struct completion restart;
45 struct bio *bio_in; 36 struct bio *bio_in;
46 struct bio *bio_out; 37 struct bio *bio_out;
47 unsigned int offset_in; 38 unsigned int offset_in;
@@ -49,7 +40,27 @@ struct convert_context {
49 unsigned int idx_in; 40 unsigned int idx_in;
50 unsigned int idx_out; 41 unsigned int idx_out;
51 sector_t sector; 42 sector_t sector;
52 int write; 43 atomic_t pending;
44};
45
46/*
47 * per bio private data
48 */
49struct dm_crypt_io {
50 struct dm_target *target;
51 struct bio *base_bio;
52 struct work_struct work;
53
54 struct convert_context ctx;
55
56 atomic_t pending;
57 int error;
58 sector_t sector;
59};
60
61struct dm_crypt_request {
62 struct scatterlist sg_in;
63 struct scatterlist sg_out;
53}; 64};
54 65
55struct crypt_config; 66struct crypt_config;
@@ -72,10 +83,11 @@ struct crypt_config {
72 sector_t start; 83 sector_t start;
73 84
74 /* 85 /*
75 * pool for per bio private data and 86 * pool for per bio private data, crypto requests and
76 * for encryption buffer pages 87 * encryption requeusts/buffer pages
77 */ 88 */
78 mempool_t *io_pool; 89 mempool_t *io_pool;
90 mempool_t *req_pool;
79 mempool_t *page_pool; 91 mempool_t *page_pool;
80 struct bio_set *bs; 92 struct bio_set *bs;
81 93
@@ -93,9 +105,25 @@ struct crypt_config {
93 sector_t iv_offset; 105 sector_t iv_offset;
94 unsigned int iv_size; 106 unsigned int iv_size;
95 107
108 /*
109 * Layout of each crypto request:
110 *
111 * struct ablkcipher_request
112 * context
113 * padding
114 * struct dm_crypt_request
115 * padding
116 * IV
117 *
118 * The padding is added so that dm_crypt_request and the IV are
119 * correctly aligned.
120 */
121 unsigned int dmreq_start;
122 struct ablkcipher_request *req;
123
96 char cipher[CRYPTO_MAX_ALG_NAME]; 124 char cipher[CRYPTO_MAX_ALG_NAME];
97 char chainmode[CRYPTO_MAX_ALG_NAME]; 125 char chainmode[CRYPTO_MAX_ALG_NAME];
98 struct crypto_blkcipher *tfm; 126 struct crypto_ablkcipher *tfm;
99 unsigned long flags; 127 unsigned long flags;
100 unsigned int key_size; 128 unsigned int key_size;
101 u8 key[0]; 129 u8 key[0];
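The layout described in the comment above is what crypt_ctr() later sizes the request mempool for (see the dmreq_start computation further down in this patch). A minimal userspace sketch of that arithmetic follows; every size and alignment in it is a stand-in for what the crypto API would report for a real tfm, not a value taken from this patch.

/*
 * Userspace sketch of the single-allocation layout described above:
 * ablkcipher_request | tfm context | pad | dm_crypt_request | pad | IV.
 * All sizes/alignments are stand-ins for crypto_ablkcipher_reqsize(),
 * crypto_tfm_ctx_alignment() and crypto_ablkcipher_alignmask().
 */
#include <stdio.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
        size_t req_size   = 64;  /* sizeof(struct ablkcipher_request), stand-in */
        size_t reqsize    = 96;  /* crypto_ablkcipher_reqsize(tfm), stand-in    */
        size_t ctx_align  = 8;   /* crypto_tfm_ctx_alignment(), stand-in        */
        size_t alignmask  = 15;  /* crypto_ablkcipher_alignmask(tfm), stand-in  */
        size_t dmreq_size = 32;  /* sizeof(struct dm_crypt_request), stand-in   */
        size_t iv_size    = 16;

        /* mirrors the dmreq_start computation added to crypt_ctr() */
        size_t dmreq_start = ALIGN_UP(req_size + reqsize, ctx_align);
        dmreq_start += alignmask & ~(ctx_align - 1);

        /* mirrors the IV pointer computation in crypt_convert_block() */
        size_t iv_off = ALIGN_UP(dmreq_start + dmreq_size, alignmask + 1);

        printf("dm_crypt_request at %zu, IV at %zu, pool element %zu bytes\n",
               dmreq_start, iv_off, dmreq_start + dmreq_size + iv_size);
        return 0;
}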
@@ -108,6 +136,7 @@ struct crypt_config {
108static struct kmem_cache *_crypt_io_pool; 136static struct kmem_cache *_crypt_io_pool;
109 137
110static void clone_init(struct dm_crypt_io *, struct bio *); 138static void clone_init(struct dm_crypt_io *, struct bio *);
139static void kcryptd_queue_crypt(struct dm_crypt_io *io);
111 140
112/* 141/*
113 * Different IV generation algorithms: 142 * Different IV generation algorithms:
@@ -188,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
188 return PTR_ERR(essiv_tfm); 217 return PTR_ERR(essiv_tfm);
189 } 218 }
190 if (crypto_cipher_blocksize(essiv_tfm) != 219 if (crypto_cipher_blocksize(essiv_tfm) !=
191 crypto_blkcipher_ivsize(cc->tfm)) { 220 crypto_ablkcipher_ivsize(cc->tfm)) {
192 ti->error = "Block size of ESSIV cipher does " 221 ti->error = "Block size of ESSIV cipher does "
193 "not match IV size of block cipher"; 222 "not match IV size of block cipher";
194 crypto_free_cipher(essiv_tfm); 223 crypto_free_cipher(essiv_tfm);
@@ -225,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
225static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 254static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
226 const char *opts) 255 const char *opts)
227{ 256{
228 unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); 257 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
229 int log = ilog2(bs); 258 int log = ilog2(bs);
230 259
231 /* we need to calculate how far we must shift the sector count 260 /* we need to calculate how far we must shift the sector count
@@ -289,42 +318,10 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
289 .generator = crypt_iv_null_gen 318 .generator = crypt_iv_null_gen
290}; 319};
291 320
292static int
293crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
294 struct scatterlist *in, unsigned int length,
295 int write, sector_t sector)
296{
297 u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
298 struct blkcipher_desc desc = {
299 .tfm = cc->tfm,
300 .info = iv,
301 .flags = CRYPTO_TFM_REQ_MAY_SLEEP,
302 };
303 int r;
304
305 if (cc->iv_gen_ops) {
306 r = cc->iv_gen_ops->generator(cc, iv, sector);
307 if (r < 0)
308 return r;
309
310 if (write)
311 r = crypto_blkcipher_encrypt_iv(&desc, out, in, length);
312 else
313 r = crypto_blkcipher_decrypt_iv(&desc, out, in, length);
314 } else {
315 if (write)
316 r = crypto_blkcipher_encrypt(&desc, out, in, length);
317 else
318 r = crypto_blkcipher_decrypt(&desc, out, in, length);
319 }
320
321 return r;
322}
323
324static void crypt_convert_init(struct crypt_config *cc, 321static void crypt_convert_init(struct crypt_config *cc,
325 struct convert_context *ctx, 322 struct convert_context *ctx,
326 struct bio *bio_out, struct bio *bio_in, 323 struct bio *bio_out, struct bio *bio_in,
327 sector_t sector, int write) 324 sector_t sector)
328{ 325{
329 ctx->bio_in = bio_in; 326 ctx->bio_in = bio_in;
330 ctx->bio_out = bio_out; 327 ctx->bio_out = bio_out;
@@ -333,7 +330,79 @@ static void crypt_convert_init(struct crypt_config *cc,
333 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 330 ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
334 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 331 ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
335 ctx->sector = sector + cc->iv_offset; 332 ctx->sector = sector + cc->iv_offset;
336 ctx->write = write; 333 init_completion(&ctx->restart);
334 /*
335 * Crypto operation can be asynchronous,
336 * ctx->pending is increased after request submission.
337 * We need to ensure that we don't call the crypt finish
338 * operation before pending got incremented
339 * (dependent on crypt submission return code).
340 */
341 atomic_set(&ctx->pending, 2);
342}
343
344static int crypt_convert_block(struct crypt_config *cc,
345 struct convert_context *ctx,
346 struct ablkcipher_request *req)
347{
348 struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
349 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
350 struct dm_crypt_request *dmreq;
351 u8 *iv;
352 int r = 0;
353
354 dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
355 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
356 crypto_ablkcipher_alignmask(cc->tfm) + 1);
357
358 sg_init_table(&dmreq->sg_in, 1);
359 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
360 bv_in->bv_offset + ctx->offset_in);
361
362 sg_init_table(&dmreq->sg_out, 1);
363 sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
364 bv_out->bv_offset + ctx->offset_out);
365
366 ctx->offset_in += 1 << SECTOR_SHIFT;
367 if (ctx->offset_in >= bv_in->bv_len) {
368 ctx->offset_in = 0;
369 ctx->idx_in++;
370 }
371
372 ctx->offset_out += 1 << SECTOR_SHIFT;
373 if (ctx->offset_out >= bv_out->bv_len) {
374 ctx->offset_out = 0;
375 ctx->idx_out++;
376 }
377
378 if (cc->iv_gen_ops) {
379 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
380 if (r < 0)
381 return r;
382 }
383
384 ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
385 1 << SECTOR_SHIFT, iv);
386
387 if (bio_data_dir(ctx->bio_in) == WRITE)
388 r = crypto_ablkcipher_encrypt(req);
389 else
390 r = crypto_ablkcipher_decrypt(req);
391
392 return r;
393}
394
395static void kcryptd_async_done(struct crypto_async_request *async_req,
396 int error);
397static void crypt_alloc_req(struct crypt_config *cc,
398 struct convert_context *ctx)
399{
400 if (!cc->req)
401 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
402 ablkcipher_request_set_tfm(cc->req, cc->tfm);
403 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
404 CRYPTO_TFM_REQ_MAY_SLEEP,
405 kcryptd_async_done, ctx);
337} 406}
338 407
339/* 408/*
@@ -346,36 +415,38 @@ static int crypt_convert(struct crypt_config *cc,
346 415
347 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 416 while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
348 ctx->idx_out < ctx->bio_out->bi_vcnt) { 417 ctx->idx_out < ctx->bio_out->bi_vcnt) {
349 struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
350 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
351 struct scatterlist sg_in, sg_out;
352
353 sg_init_table(&sg_in, 1);
354 sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in);
355
356 sg_init_table(&sg_out, 1);
357 sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out);
358 418
359 ctx->offset_in += sg_in.length; 419 crypt_alloc_req(cc, ctx);
360 if (ctx->offset_in >= bv_in->bv_len) { 420
361 ctx->offset_in = 0; 421 r = crypt_convert_block(cc, ctx, cc->req);
362 ctx->idx_in++; 422
423 switch (r) {
424 case -EBUSY:
425 wait_for_completion(&ctx->restart);
426 INIT_COMPLETION(ctx->restart);
427 /* fall through*/
428 case -EINPROGRESS:
429 atomic_inc(&ctx->pending);
430 cc->req = NULL;
431 r = 0;
432 /* fall through*/
433 case 0:
434 ctx->sector++;
435 continue;
363 } 436 }
364 437
365 ctx->offset_out += sg_out.length; 438 break;
366 if (ctx->offset_out >= bv_out->bv_len) {
367 ctx->offset_out = 0;
368 ctx->idx_out++;
369 }
370
371 r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
372 ctx->write, ctx->sector);
373 if (r < 0)
374 break;
375
376 ctx->sector++;
377 } 439 }
378 440
441 /*
442 * If there are pending crypto operation run async
443 * code. Otherwise process return code synchronously.
444 * The step of 2 ensures that async finish doesn't
445 * call crypto finish too early.
446 */
447 if (atomic_sub_return(2, &ctx->pending))
448 return -EINPROGRESS;
449
379 return r; 450 return r;
380} 451}
381 452
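The switch above is the heart of the async conversion: -EBUSY waits for the backlogged request to restart, -EINPROGRESS hands completion over to kcryptd_async_done(), and the initial pending count of 2 keeps either side from finishing the io before the other is ready. A small self-contained sketch of that reference-count protocol follows, using plain C11 atomics; submit/complete are stand-ins, no real crypto or threading is involved.

/*
 * Sketch of the ctx->pending protocol used by crypt_convert() above.
 * complete_block() is a stand-in for kcryptd_async_done() firing later.
 */
#include <stdio.h>
#include <stdatomic.h>

static atomic_int pending;

static void complete_block(void)  /* kcryptd_async_done() stand-in */
{
        if (atomic_fetch_sub(&pending, 1) == 1)
                printf("async side: last completion, finish the io\n");
}

int main(void)
{
        int nr_blocks = 3;

        /* crypt_convert_init(): start at 2, one "half" for the submitter,
         * one for the async completions */
        atomic_store(&pending, 2);

        /* pretend every block returns -EINPROGRESS */
        for (int i = 0; i < nr_blocks; i++)
                atomic_fetch_add(&pending, 1);

        /* end of crypt_convert(): drop the initial 2; a non-zero result
         * means the async completions own the finish (-EINPROGRESS) */
        if (atomic_fetch_sub(&pending, 2) - 2)
                printf("submitter: returning -EINPROGRESS\n");
        else
                printf("submitter: finishing the io synchronously\n");

        /* the completions arrive later */
        for (int i = 0; i < nr_blocks; i++)
                complete_block();

        return 0;
}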
@@ -455,18 +526,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
455 * One of the bios was finished. Check for completion of 526 * One of the bios was finished. Check for completion of
456 * the whole request and correctly clean up the buffer. 527 * the whole request and correctly clean up the buffer.
457 */ 528 */
458static void crypt_dec_pending(struct dm_crypt_io *io, int error) 529static void crypt_dec_pending(struct dm_crypt_io *io)
459{ 530{
460 struct crypt_config *cc = (struct crypt_config *) io->target->private; 531 struct crypt_config *cc = io->target->private;
461
462 if (error < 0)
463 io->error = error;
464 532
465 if (!atomic_dec_and_test(&io->pending)) 533 if (!atomic_dec_and_test(&io->pending))
466 return; 534 return;
467 535
468 bio_endio(io->base_bio, io->error); 536 bio_endio(io->base_bio, io->error);
469
470 mempool_free(io, cc->io_pool); 537 mempool_free(io, cc->io_pool);
471} 538}
472 539
@@ -484,30 +551,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io, int error)
484 * starved by new requests which can block in the first stages due 551 * starved by new requests which can block in the first stages due
485 * to memory allocation. 552 * to memory allocation.
486 */ 553 */
487static void kcryptd_do_work(struct work_struct *work);
488static void kcryptd_do_crypt(struct work_struct *work);
489
490static void kcryptd_queue_io(struct dm_crypt_io *io)
491{
492 struct crypt_config *cc = io->target->private;
493
494 INIT_WORK(&io->work, kcryptd_do_work);
495 queue_work(cc->io_queue, &io->work);
496}
497
498static void kcryptd_queue_crypt(struct dm_crypt_io *io)
499{
500 struct crypt_config *cc = io->target->private;
501
502 INIT_WORK(&io->work, kcryptd_do_crypt);
503 queue_work(cc->crypt_queue, &io->work);
504}
505
506static void crypt_endio(struct bio *clone, int error) 554static void crypt_endio(struct bio *clone, int error)
507{ 555{
508 struct dm_crypt_io *io = clone->bi_private; 556 struct dm_crypt_io *io = clone->bi_private;
509 struct crypt_config *cc = io->target->private; 557 struct crypt_config *cc = io->target->private;
510 unsigned read_io = bio_data_dir(clone) == READ; 558 unsigned rw = bio_data_dir(clone);
511 559
512 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) 560 if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
513 error = -EIO; 561 error = -EIO;
@@ -515,21 +563,20 @@ static void crypt_endio(struct bio *clone, int error)
515 /* 563 /*
516 * free the processed pages 564 * free the processed pages
517 */ 565 */
518 if (!read_io) { 566 if (rw == WRITE)
519 crypt_free_buffer_pages(cc, clone); 567 crypt_free_buffer_pages(cc, clone);
520 goto out; 568
569 bio_put(clone);
570
571 if (rw == READ && !error) {
572 kcryptd_queue_crypt(io);
573 return;
521 } 574 }
522 575
523 if (unlikely(error)) 576 if (unlikely(error))
524 goto out; 577 io->error = error;
525
526 bio_put(clone);
527 kcryptd_queue_crypt(io);
528 return;
529 578
530out: 579 crypt_dec_pending(io);
531 bio_put(clone);
532 crypt_dec_pending(io, error);
533} 580}
534 581
535static void clone_init(struct dm_crypt_io *io, struct bio *clone) 582static void clone_init(struct dm_crypt_io *io, struct bio *clone)
@@ -543,12 +590,11 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
543 clone->bi_destructor = dm_crypt_bio_destructor; 590 clone->bi_destructor = dm_crypt_bio_destructor;
544} 591}
545 592
546static void process_read(struct dm_crypt_io *io) 593static void kcryptd_io_read(struct dm_crypt_io *io)
547{ 594{
548 struct crypt_config *cc = io->target->private; 595 struct crypt_config *cc = io->target->private;
549 struct bio *base_bio = io->base_bio; 596 struct bio *base_bio = io->base_bio;
550 struct bio *clone; 597 struct bio *clone;
551 sector_t sector = base_bio->bi_sector - io->target->begin;
552 598
553 atomic_inc(&io->pending); 599 atomic_inc(&io->pending);
554 600
@@ -559,7 +605,8 @@ static void process_read(struct dm_crypt_io *io)
559 */ 605 */
560 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 606 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
561 if (unlikely(!clone)) { 607 if (unlikely(!clone)) {
562 crypt_dec_pending(io, -ENOMEM); 608 io->error = -ENOMEM;
609 crypt_dec_pending(io);
563 return; 610 return;
564 } 611 }
565 612
@@ -567,25 +614,71 @@ static void process_read(struct dm_crypt_io *io)
567 clone->bi_idx = 0; 614 clone->bi_idx = 0;
568 clone->bi_vcnt = bio_segments(base_bio); 615 clone->bi_vcnt = bio_segments(base_bio);
569 clone->bi_size = base_bio->bi_size; 616 clone->bi_size = base_bio->bi_size;
570 clone->bi_sector = cc->start + sector; 617 clone->bi_sector = cc->start + io->sector;
571 memcpy(clone->bi_io_vec, bio_iovec(base_bio), 618 memcpy(clone->bi_io_vec, bio_iovec(base_bio),
572 sizeof(struct bio_vec) * clone->bi_vcnt); 619 sizeof(struct bio_vec) * clone->bi_vcnt);
573 620
574 generic_make_request(clone); 621 generic_make_request(clone);
575} 622}
576 623
577static void process_write(struct dm_crypt_io *io) 624static void kcryptd_io_write(struct dm_crypt_io *io)
625{
626 struct bio *clone = io->ctx.bio_out;
627
628 generic_make_request(clone);
629}
630
631static void kcryptd_io(struct work_struct *work)
632{
633 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
634
635 if (bio_data_dir(io->base_bio) == READ)
636 kcryptd_io_read(io);
637 else
638 kcryptd_io_write(io);
639}
640
641static void kcryptd_queue_io(struct dm_crypt_io *io)
578{ 642{
579 struct crypt_config *cc = io->target->private; 643 struct crypt_config *cc = io->target->private;
580 struct bio *base_bio = io->base_bio;
581 struct bio *clone;
582 struct convert_context ctx;
583 unsigned remaining = base_bio->bi_size;
584 sector_t sector = base_bio->bi_sector - io->target->begin;
585 644
586 atomic_inc(&io->pending); 645 INIT_WORK(&io->work, kcryptd_io);
646 queue_work(cc->io_queue, &io->work);
647}
587 648
588 crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1); 649static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
650 int error, int async)
651{
652 struct bio *clone = io->ctx.bio_out;
653 struct crypt_config *cc = io->target->private;
654
655 if (unlikely(error < 0)) {
656 crypt_free_buffer_pages(cc, clone);
657 bio_put(clone);
658 io->error = -EIO;
659 return;
660 }
661
662 /* crypt_convert should have filled the clone bio */
663 BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
664
665 clone->bi_sector = cc->start + io->sector;
666 io->sector += bio_sectors(clone);
667
668 if (async)
669 kcryptd_queue_io(io);
670 else {
671 atomic_inc(&io->pending);
672 generic_make_request(clone);
673 }
674}
675
676static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
677{
678 struct crypt_config *cc = io->target->private;
679 struct bio *clone;
680 unsigned remaining = io->base_bio->bi_size;
681 int r;
589 682
590 /* 683 /*
591 * The allocated buffers can be smaller than the whole bio, 684 * The allocated buffers can be smaller than the whole bio,
@@ -594,70 +687,110 @@ static void process_write(struct dm_crypt_io *io)
594 while (remaining) { 687 while (remaining) {
595 clone = crypt_alloc_buffer(io, remaining); 688 clone = crypt_alloc_buffer(io, remaining);
596 if (unlikely(!clone)) { 689 if (unlikely(!clone)) {
597 crypt_dec_pending(io, -ENOMEM); 690 io->error = -ENOMEM;
598 return; 691 return;
599 } 692 }
600 693
601 ctx.bio_out = clone; 694 io->ctx.bio_out = clone;
602 ctx.idx_out = 0; 695 io->ctx.idx_out = 0;
603 696
604 if (unlikely(crypt_convert(cc, &ctx) < 0)) {
605 crypt_free_buffer_pages(cc, clone);
606 bio_put(clone);
607 crypt_dec_pending(io, -EIO);
608 return;
609 }
610
611 /* crypt_convert should have filled the clone bio */
612 BUG_ON(ctx.idx_out < clone->bi_vcnt);
613
614 clone->bi_sector = cc->start + sector;
615 remaining -= clone->bi_size; 697 remaining -= clone->bi_size;
616 sector += bio_sectors(clone);
617 698
618 /* Grab another reference to the io struct 699 r = crypt_convert(cc, &io->ctx);
619 * before we kick off the request */
620 if (remaining)
621 atomic_inc(&io->pending);
622 700
623 generic_make_request(clone); 701 if (r != -EINPROGRESS) {
624 702 kcryptd_crypt_write_io_submit(io, r, 0);
625 /* Do not reference clone after this - it 703 if (unlikely(r < 0))
626 * may be gone already. */ 704 return;
705 } else
706 atomic_inc(&io->pending);
627 707
628 /* out of memory -> run queues */ 708 /* out of memory -> run queues */
629 if (remaining) 709 if (unlikely(remaining))
630 congestion_wait(WRITE, HZ/100); 710 congestion_wait(WRITE, HZ/100);
631 } 711 }
632} 712}
633 713
634static void process_read_endio(struct dm_crypt_io *io) 714static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
635{ 715{
636 struct crypt_config *cc = io->target->private; 716 struct crypt_config *cc = io->target->private;
637 struct convert_context ctx;
638 717
639 crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, 718 /*
640 io->base_bio->bi_sector - io->target->begin, 0); 719 * Prevent io from disappearing until this function completes.
720 */
721 atomic_inc(&io->pending);
722
723 crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
724 kcryptd_crypt_write_convert_loop(io);
641 725
642 crypt_dec_pending(io, crypt_convert(cc, &ctx)); 726 crypt_dec_pending(io);
643} 727}
644 728
645static void kcryptd_do_work(struct work_struct *work) 729static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
646{ 730{
647 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 731 if (unlikely(error < 0))
732 io->error = -EIO;
733
734 crypt_dec_pending(io);
735}
736
737static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
738{
739 struct crypt_config *cc = io->target->private;
740 int r = 0;
741
742 atomic_inc(&io->pending);
743
744 crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
745 io->sector);
746
747 r = crypt_convert(cc, &io->ctx);
748
749 if (r != -EINPROGRESS)
750 kcryptd_crypt_read_done(io, r);
751
752 crypt_dec_pending(io);
753}
754
755static void kcryptd_async_done(struct crypto_async_request *async_req,
756 int error)
757{
758 struct convert_context *ctx = async_req->data;
759 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
760 struct crypt_config *cc = io->target->private;
761
762 if (error == -EINPROGRESS) {
763 complete(&ctx->restart);
764 return;
765 }
766
767 mempool_free(ablkcipher_request_cast(async_req), cc->req_pool);
768
769 if (!atomic_dec_and_test(&ctx->pending))
770 return;
648 771
649 if (bio_data_dir(io->base_bio) == READ) 772 if (bio_data_dir(io->base_bio) == READ)
650 process_read(io); 773 kcryptd_crypt_read_done(io, error);
774 else
775 kcryptd_crypt_write_io_submit(io, error, 1);
651} 776}
652 777
653static void kcryptd_do_crypt(struct work_struct *work) 778static void kcryptd_crypt(struct work_struct *work)
654{ 779{
655 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 780 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
656 781
657 if (bio_data_dir(io->base_bio) == READ) 782 if (bio_data_dir(io->base_bio) == READ)
658 process_read_endio(io); 783 kcryptd_crypt_read_convert(io);
659 else 784 else
660 process_write(io); 785 kcryptd_crypt_write_convert(io);
786}
787
788static void kcryptd_queue_crypt(struct dm_crypt_io *io)
789{
790 struct crypt_config *cc = io->target->private;
791
792 INIT_WORK(&io->work, kcryptd_crypt);
793 queue_work(cc->crypt_queue, &io->work);
661} 794}
662 795
663/* 796/*
@@ -733,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
733static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) 866static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
734{ 867{
735 struct crypt_config *cc; 868 struct crypt_config *cc;
736 struct crypto_blkcipher *tfm; 869 struct crypto_ablkcipher *tfm;
737 char *tmp; 870 char *tmp;
738 char *cipher; 871 char *cipher;
739 char *chainmode; 872 char *chainmode;
@@ -787,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
787 goto bad_cipher; 920 goto bad_cipher;
788 } 921 }
789 922
790 tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); 923 tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
791 if (IS_ERR(tfm)) { 924 if (IS_ERR(tfm)) {
792 ti->error = "Error allocating crypto tfm"; 925 ti->error = "Error allocating crypto tfm";
793 goto bad_cipher; 926 goto bad_cipher;
@@ -821,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
821 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 954 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
822 goto bad_ivmode; 955 goto bad_ivmode;
823 956
824 cc->iv_size = crypto_blkcipher_ivsize(tfm); 957 cc->iv_size = crypto_ablkcipher_ivsize(tfm);
825 if (cc->iv_size) 958 if (cc->iv_size)
826 /* at least a 64 bit sector number should fit in our buffer */ 959 /* at least a 64 bit sector number should fit in our buffer */
827 cc->iv_size = max(cc->iv_size, 960 cc->iv_size = max(cc->iv_size,
@@ -841,6 +974,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
841 goto bad_slab_pool; 974 goto bad_slab_pool;
842 } 975 }
843 976
977 cc->dmreq_start = sizeof(struct ablkcipher_request);
978 cc->dmreq_start += crypto_ablkcipher_reqsize(tfm);
979 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
980 cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) &
981 ~(crypto_tfm_ctx_alignment() - 1);
982
983 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
984 sizeof(struct dm_crypt_request) + cc->iv_size);
985 if (!cc->req_pool) {
986 ti->error = "Cannot allocate crypt request mempool";
987 goto bad_req_pool;
988 }
989 cc->req = NULL;
990
844 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 991 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
845 if (!cc->page_pool) { 992 if (!cc->page_pool) {
846 ti->error = "Cannot allocate page mempool"; 993 ti->error = "Cannot allocate page mempool";
@@ -853,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
853 goto bad_bs; 1000 goto bad_bs;
854 } 1001 }
855 1002
856 if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { 1003 if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
857 ti->error = "Error setting key"; 1004 ti->error = "Error setting key";
858 goto bad_device; 1005 goto bad_device;
859 } 1006 }
@@ -914,12 +1061,14 @@ bad_device:
914bad_bs: 1061bad_bs:
915 mempool_destroy(cc->page_pool); 1062 mempool_destroy(cc->page_pool);
916bad_page_pool: 1063bad_page_pool:
1064 mempool_destroy(cc->req_pool);
1065bad_req_pool:
917 mempool_destroy(cc->io_pool); 1066 mempool_destroy(cc->io_pool);
918bad_slab_pool: 1067bad_slab_pool:
919 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1068 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
920 cc->iv_gen_ops->dtr(cc); 1069 cc->iv_gen_ops->dtr(cc);
921bad_ivmode: 1070bad_ivmode:
922 crypto_free_blkcipher(tfm); 1071 crypto_free_ablkcipher(tfm);
923bad_cipher: 1072bad_cipher:
924 /* Must zero key material before freeing */ 1073 /* Must zero key material before freeing */
925 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); 1074 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
@@ -934,14 +1083,18 @@ static void crypt_dtr(struct dm_target *ti)
934 destroy_workqueue(cc->io_queue); 1083 destroy_workqueue(cc->io_queue);
935 destroy_workqueue(cc->crypt_queue); 1084 destroy_workqueue(cc->crypt_queue);
936 1085
1086 if (cc->req)
1087 mempool_free(cc->req, cc->req_pool);
1088
937 bioset_free(cc->bs); 1089 bioset_free(cc->bs);
938 mempool_destroy(cc->page_pool); 1090 mempool_destroy(cc->page_pool);
1091 mempool_destroy(cc->req_pool);
939 mempool_destroy(cc->io_pool); 1092 mempool_destroy(cc->io_pool);
940 1093
941 kfree(cc->iv_mode); 1094 kfree(cc->iv_mode);
942 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1095 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
943 cc->iv_gen_ops->dtr(cc); 1096 cc->iv_gen_ops->dtr(cc);
944 crypto_free_blkcipher(cc->tfm); 1097 crypto_free_ablkcipher(cc->tfm);
945 dm_put_device(ti, cc->dev); 1098 dm_put_device(ti, cc->dev);
946 1099
947 /* Must zero key material before freeing */ 1100 /* Must zero key material before freeing */
@@ -958,6 +1111,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
958 io = mempool_alloc(cc->io_pool, GFP_NOIO); 1111 io = mempool_alloc(cc->io_pool, GFP_NOIO);
959 io->target = ti; 1112 io->target = ti;
960 io->base_bio = bio; 1113 io->base_bio = bio;
1114 io->sector = bio->bi_sector - ti->begin;
961 io->error = 0; 1115 io->error = 0;
962 atomic_set(&io->pending, 0); 1116 atomic_set(&io->pending, 0);
963 1117
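With this patch, dm-crypt keeps io->sector target-relative (crypt_map() subtracts ti->begin) and advances it as each encrypted clone is submitted; the device-absolute sector is recovered by adding cc->start. A tiny sketch of that bookkeeping follows; all the offsets and sizes are made up for illustration.

/*
 * Sketch of the sector bookkeeping added above.  ti_begin plays the role
 * of ti->begin and cc_start of cc->start; the numbers are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long ti_begin  = 2048;       /* target start in the table */
        unsigned long long cc_start  = 4096;       /* data offset on the device */
        unsigned long long bi_sector = 2048 + 24;  /* incoming bio->bi_sector   */
        unsigned long long remaining = 32;         /* sectors still to write    */
        unsigned long long per_clone = 8;          /* sectors per clone         */

        unsigned long long io_sector = bi_sector - ti_begin;  /* crypt_map() */

        while (remaining) {
                /* kcryptd_crypt_write_io_submit(): clone->bi_sector */
                printf("clone -> device sector %llu (%llu sectors)\n",
                       cc_start + io_sector, per_clone);
                io_sector += per_clone;
                remaining -= per_clone;
        }
        return 0;
}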
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 8fe81e1807e0..5bbce29f143a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store)
 
 static int persistent_read_metadata(struct exception_store *store)
 {
-        int r, new_snapshot;
+        int r, uninitialized_var(new_snapshot);
         struct pstore *ps = get_info(store);
 
         /*
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 9627fa0f9470..b262c0042de3 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/dm-ioctl.h>
 #include <linux/hdreg.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 
@@ -702,7 +703,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
         int r;
         char *new_name = (char *) param + param->data_start;
 
-        if (new_name < (char *) param->data ||
+        if (new_name < param->data ||
             invalid_str(new_name, (void *) param + param_size)) {
                 DMWARN("Invalid new logical volume name supplied.");
                 return -EINVAL;
@@ -728,7 +729,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
         if (!md)
                 return -ENXIO;
 
-        if (geostr < (char *) param->data ||
+        if (geostr < param->data ||
             invalid_str(geostr, (void *) param + param_size)) {
                 DMWARN("Invalid geometry supplied.");
                 goto out;
@@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
 {
         struct dm_ioctl tmp, *dmi;
 
-        if (copy_from_user(&tmp, user, sizeof(tmp)))
+        if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
                 return -EFAULT;
 
-        if (tmp.data_size < sizeof(tmp))
+        if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
                 return -EINVAL;
 
         dmi = vmalloc(tmp.data_size);
@@ -1397,13 +1398,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
         return 0;
 }
 
-static int ctl_ioctl(struct inode *inode, struct file *file,
-                     uint command, ulong u)
+static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
 {
         int r = 0;
         unsigned int cmd;
-        struct dm_ioctl *param;
-        struct dm_ioctl __user *user = (struct dm_ioctl __user *) u;
+        struct dm_ioctl *uninitialized_var(param);
         ioctl_fn fn = NULL;
         size_t param_size;
 
@@ -1471,8 +1470,23 @@ static int ctl_ioctl(struct inode *inode, struct file *file,
         return r;
 }
 
+static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+        return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+}
+
+#ifdef CONFIG_COMPAT
+static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
+{
+        return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define dm_compat_ctl_ioctl NULL
+#endif
+
 static const struct file_operations _ctl_fops = {
-        .ioctl           = ctl_ioctl,
+        .unlocked_ioctl  = dm_ctl_ioctl,
+        .compat_ioctl    = dm_compat_ctl_ioctl,
         .owner           = THIS_MODULE,
 };
 
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 072ee4353eab..2a74b2142f50 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type)
         return 0;
 }
 
-static struct dirty_log_type *get_type(const char *type_name)
+static struct dirty_log_type *_get_type(const char *type_name)
 {
         struct dirty_log_type *type;
 
@@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name)
         return NULL;
 }
 
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dirty_log_type by name. If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Log modules are named "dm-log-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-log-<type_name>",
+ * then truncate 'type_name' on the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-disk", it would search
+ * 'dm-log-clustered-disk' then 'dm-log-clustered'.
+ *
+ * Returns: dirty_log_type* on success, NULL on failure
+ */
+static struct dirty_log_type *get_type(const char *type_name)
+{
+        char *p, *type_name_dup;
+        struct dirty_log_type *type;
+
+        type = _get_type(type_name);
+        if (type)
+                return type;
+
+        type_name_dup = kstrdup(type_name, GFP_KERNEL);
+        if (!type_name_dup) {
+                DMWARN("No memory left to attempt log module load for \"%s\"",
+                       type_name);
+                return NULL;
+        }
+
+        while (request_module("dm-log-%s", type_name_dup) ||
+               !(type = _get_type(type_name))) {
+                p = strrchr(type_name_dup, '-');
+                if (!p)
+                        break;
+                p[0] = '\0';
+        }
+
+        if (!type)
+                DMWARN("Module for logging type \"%s\" not found.", type_name);
+
+        kfree(type_name_dup);
+
+        return type;
+}
+
 static void put_type(struct dirty_log_type *type)
 {
         spin_lock(&_lock);
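The fallback naming scheme documented in get_type() above ("dm-log-<type>", then truncate at the last '-' and retry) is easy to see in isolation. A small userspace sketch follows; try_module() is a stand-in for the request_module()/_get_type() pair and pretends only "dm-log-clustered" exists.

/*
 * Userspace sketch of the module-name fallback documented in get_type().
 * try_module() is a stand-in, not a kernel interface.
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool try_module(const char *name)
{
        printf("request_module(\"dm-log-%s\")\n", name);
        return strcmp(name, "clustered") == 0;   /* pretend only this exists */
}

int main(void)
{
        char name[] = "clustered-disk";   /* mutable copy, like kstrdup() */
        bool found;
        char *p;

        while (!(found = try_module(name))) {
                p = strrchr(name, '-');
                if (!p)
                        break;
                *p = '\0';                /* "clustered-disk" -> "clustered" */
        }

        printf(found ? "loaded dm-log-%s\n" : "no log module for \"%s\"\n", name);
        return 0;
}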
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 24b2b1e32fae..e7ee59e655d5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath);
 
 static struct kmem_cache *_mpio_cache;
 
-struct workqueue_struct *kmultipathd;
+static struct workqueue_struct *kmultipathd;
 static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 31123d4a6b9c..edc057f5cdcc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-list.h" 8#include "dm-bio-list.h"
9#include "dm-bio-record.h"
9#include "dm-io.h" 10#include "dm-io.h"
10#include "dm-log.h" 11#include "dm-log.h"
11#include "kcopyd.h" 12#include "kcopyd.h"
@@ -20,6 +21,7 @@
20#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
21#include <linux/workqueue.h> 22#include <linux/workqueue.h>
22#include <linux/log2.h> 23#include <linux/log2.h>
24#include <linux/hardirq.h>
23 25
24#define DM_MSG_PREFIX "raid1" 26#define DM_MSG_PREFIX "raid1"
25#define DM_IO_PAGES 64 27#define DM_IO_PAGES 64
@@ -113,9 +115,16 @@ struct region {
113/*----------------------------------------------------------------- 115/*-----------------------------------------------------------------
114 * Mirror set structures. 116 * Mirror set structures.
115 *---------------------------------------------------------------*/ 117 *---------------------------------------------------------------*/
118enum dm_raid1_error {
119 DM_RAID1_WRITE_ERROR,
120 DM_RAID1_SYNC_ERROR,
121 DM_RAID1_READ_ERROR
122};
123
116struct mirror { 124struct mirror {
117 struct mirror_set *ms; 125 struct mirror_set *ms;
118 atomic_t error_count; 126 atomic_t error_count;
127 uint32_t error_type;
119 struct dm_dev *dev; 128 struct dm_dev *dev;
120 sector_t offset; 129 sector_t offset;
121}; 130};
@@ -127,21 +136,25 @@ struct mirror_set {
127 struct kcopyd_client *kcopyd_client; 136 struct kcopyd_client *kcopyd_client;
128 uint64_t features; 137 uint64_t features;
129 138
130 spinlock_t lock; /* protects the next two lists */ 139 spinlock_t lock; /* protects the lists */
131 struct bio_list reads; 140 struct bio_list reads;
132 struct bio_list writes; 141 struct bio_list writes;
142 struct bio_list failures;
133 143
134 struct dm_io_client *io_client; 144 struct dm_io_client *io_client;
145 mempool_t *read_record_pool;
135 146
136 /* recovery */ 147 /* recovery */
137 region_t nr_regions; 148 region_t nr_regions;
138 int in_sync; 149 int in_sync;
139 int log_failure; 150 int log_failure;
151 atomic_t suspend;
140 152
141 struct mirror *default_mirror; /* Default mirror */ 153 atomic_t default_mirror; /* Default mirror */
142 154
143 struct workqueue_struct *kmirrord_wq; 155 struct workqueue_struct *kmirrord_wq;
144 struct work_struct kmirrord_work; 156 struct work_struct kmirrord_work;
157 struct work_struct trigger_event;
145 158
146 unsigned int nr_mirrors; 159 unsigned int nr_mirrors;
147 struct mirror mirror[0]; 160 struct mirror mirror[0];
@@ -362,6 +375,16 @@ static void complete_resync_work(struct region *reg, int success)
362 struct region_hash *rh = reg->rh; 375 struct region_hash *rh = reg->rh;
363 376
364 rh->log->type->set_region_sync(rh->log, reg->key, success); 377 rh->log->type->set_region_sync(rh->log, reg->key, success);
378
379 /*
380 * Dispatch the bios before we call 'wake_up_all'.
381 * This is important because if we are suspending,
382 * we want to know that recovery is complete and
383 * the work queue is flushed. If we wake_up_all
384 * before we dispatch_bios (queue bios and call wake()),
385 * then we risk suspending before the work queue
386 * has been properly flushed.
387 */
365 dispatch_bios(rh->ms, &reg->delayed_bios); 388 dispatch_bios(rh->ms, &reg->delayed_bios);
366 if (atomic_dec_and_test(&rh->recovery_in_flight)) 389 if (atomic_dec_and_test(&rh->recovery_in_flight))
367 wake_up_all(&_kmirrord_recovery_stopped); 390 wake_up_all(&_kmirrord_recovery_stopped);
@@ -626,24 +649,101 @@ static void rh_start_recovery(struct region_hash *rh)
626 wake(rh->ms); 649 wake(rh->ms);
627} 650}
628 651
652#define MIN_READ_RECORDS 20
653struct dm_raid1_read_record {
654 struct mirror *m;
655 struct dm_bio_details details;
656};
657
629/* 658/*
630 * Every mirror should look like this one. 659 * Every mirror should look like this one.
631 */ 660 */
632#define DEFAULT_MIRROR 0 661#define DEFAULT_MIRROR 0
633 662
634/* 663/*
635 * This is yucky. We squirrel the mirror_set struct away inside 664 * This is yucky. We squirrel the mirror struct away inside
636 * bi_next for write buffers. This is safe since the bh 665 * bi_next for read/write buffers. This is safe since the bh
637 * doesn't get submitted to the lower levels of block layer. 666 * doesn't get submitted to the lower levels of block layer.
638 */ 667 */
639static struct mirror_set *bio_get_ms(struct bio *bio) 668static struct mirror *bio_get_m(struct bio *bio)
669{
670 return (struct mirror *) bio->bi_next;
671}
672
673static void bio_set_m(struct bio *bio, struct mirror *m)
674{
675 bio->bi_next = (struct bio *) m;
676}
677
678static struct mirror *get_default_mirror(struct mirror_set *ms)
640{ 679{
641 return (struct mirror_set *) bio->bi_next; 680 return &ms->mirror[atomic_read(&ms->default_mirror)];
642} 681}
643 682
644static void bio_set_ms(struct bio *bio, struct mirror_set *ms) 683static void set_default_mirror(struct mirror *m)
645{ 684{
646 bio->bi_next = (struct bio *) ms; 685 struct mirror_set *ms = m->ms;
686 struct mirror *m0 = &(ms->mirror[0]);
687
688 atomic_set(&ms->default_mirror, m - m0);
689}
690
691/* fail_mirror
692 * @m: mirror device to fail
693 * @error_type: one of the enum's, DM_RAID1_*_ERROR
694 *
695 * If errors are being handled, record the type of
696 * error encountered for this device. If this type
697 * of error has already been recorded, we can return;
698 * otherwise, we must signal userspace by triggering
699 * an event. Additionally, if the device is the
700 * primary device, we must choose a new primary, but
701 * only if the mirror is in-sync.
702 *
703 * This function must not block.
704 */
705static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
706{
707 struct mirror_set *ms = m->ms;
708 struct mirror *new;
709
710 if (!errors_handled(ms))
711 return;
712
713 /*
714 * error_count is used for nothing more than a
715 * simple way to tell if a device has encountered
716 * errors.
717 */
718 atomic_inc(&m->error_count);
719
720 if (test_and_set_bit(error_type, &m->error_type))
721 return;
722
723 if (m != get_default_mirror(ms))
724 goto out;
725
726 if (!ms->in_sync) {
727 /*
728 * Better to issue requests to same failing device
729 * than to risk returning corrupt data.
730 */
731 DMERR("Primary mirror (%s) failed while out-of-sync: "
732 "Reads may fail.", m->dev->name);
733 goto out;
734 }
735
736 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
737 if (!atomic_read(&new->error_count)) {
738 set_default_mirror(new);
739 break;
740 }
741
742 if (unlikely(new == ms->mirror + ms->nr_mirrors))
743 DMWARN("All sides of mirror have failed.");
744
745out:
746 schedule_work(&ms->trigger_event);
647} 747}
648 748
649/*----------------------------------------------------------------- 749/*-----------------------------------------------------------------
@@ -656,15 +756,32 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
656static void recovery_complete(int read_err, unsigned int write_err, 756static void recovery_complete(int read_err, unsigned int write_err,
657 void *context) 757 void *context)
658{ 758{
659 struct region *reg = (struct region *) context; 759 struct region *reg = (struct region *)context;
760 struct mirror_set *ms = reg->rh->ms;
761 int m, bit = 0;
660 762
661 if (read_err) 763 if (read_err) {
662 /* Read error means the failure of default mirror. */ 764 /* Read error means the failure of default mirror. */
663 DMERR_LIMIT("Unable to read primary mirror during recovery"); 765 DMERR_LIMIT("Unable to read primary mirror during recovery");
766 fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
767 }
664 768
665 if (write_err) 769 if (write_err) {
666 DMERR_LIMIT("Write error during recovery (error = 0x%x)", 770 DMERR_LIMIT("Write error during recovery (error = 0x%x)",
667 write_err); 771 write_err);
772 /*
773 * Bits correspond to devices (excluding default mirror).
774 * The default mirror cannot change during recovery.
775 */
776 for (m = 0; m < ms->nr_mirrors; m++) {
777 if (&ms->mirror[m] == get_default_mirror(ms))
778 continue;
779 if (test_bit(bit, &write_err))
780 fail_mirror(ms->mirror + m,
781 DM_RAID1_SYNC_ERROR);
782 bit++;
783 }
784 }
668 785
669 rh_recovery_end(reg, !(read_err || write_err)); 786 rh_recovery_end(reg, !(read_err || write_err));
670} 787}
@@ -678,7 +795,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
678 unsigned long flags = 0; 795 unsigned long flags = 0;
679 796
680 /* fill in the source */ 797 /* fill in the source */
681 m = ms->default_mirror; 798 m = get_default_mirror(ms);
682 from.bdev = m->dev->bdev; 799 from.bdev = m->dev->bdev;
683 from.sector = m->offset + region_to_sector(reg->rh, reg->key); 800 from.sector = m->offset + region_to_sector(reg->rh, reg->key);
684 if (reg->key == (ms->nr_regions - 1)) { 801 if (reg->key == (ms->nr_regions - 1)) {
@@ -694,7 +811,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
694 811
695 /* fill in the destinations */ 812 /* fill in the destinations */
696 for (i = 0, dest = to; i < ms->nr_mirrors; i++) { 813 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
697 if (&ms->mirror[i] == ms->default_mirror) 814 if (&ms->mirror[i] == get_default_mirror(ms))
698 continue; 815 continue;
699 816
700 m = ms->mirror + i; 817 m = ms->mirror + i;
@@ -748,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
748 *---------------------------------------------------------------*/ 865 *---------------------------------------------------------------*/
749static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) 866static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
750{ 867{
751 /* FIXME: add read balancing */ 868 struct mirror *m = get_default_mirror(ms);
752 return ms->default_mirror; 869
870 do {
871 if (likely(!atomic_read(&m->error_count)))
872 return m;
873
874 if (m-- == ms->mirror)
875 m += ms->nr_mirrors;
876 } while (m != get_default_mirror(ms));
877
878 return NULL;
879}
880
881static int default_ok(struct mirror *m)
882{
883 struct mirror *default_mirror = get_default_mirror(m->ms);
884
885 return !atomic_read(&default_mirror->error_count);
886}
887
888static int mirror_available(struct mirror_set *ms, struct bio *bio)
889{
890 region_t region = bio_to_region(&ms->rh, bio);
891
892 if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
893 return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
894
895 return 0;
753} 896}
754 897
755/* 898/*
756 * remap a buffer to a particular mirror. 899 * remap a buffer to a particular mirror.
757 */ 900 */
758static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) 901static sector_t map_sector(struct mirror *m, struct bio *bio)
902{
903 return m->offset + (bio->bi_sector - m->ms->ti->begin);
904}
905
906static void map_bio(struct mirror *m, struct bio *bio)
759{ 907{
760 bio->bi_bdev = m->dev->bdev; 908 bio->bi_bdev = m->dev->bdev;
761 bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); 909 bio->bi_sector = map_sector(m, bio);
910}
911
912static void map_region(struct io_region *io, struct mirror *m,
913 struct bio *bio)
914{
915 io->bdev = m->dev->bdev;
916 io->sector = map_sector(m, bio);
917 io->count = bio->bi_size >> 9;
918}
919
920/*-----------------------------------------------------------------
921 * Reads
922 *---------------------------------------------------------------*/
923static void read_callback(unsigned long error, void *context)
924{
925 struct bio *bio = context;
926 struct mirror *m;
927
928 m = bio_get_m(bio);
929 bio_set_m(bio, NULL);
930
931 if (likely(!error)) {
932 bio_endio(bio, 0);
933 return;
934 }
935
936 fail_mirror(m, DM_RAID1_READ_ERROR);
937
938 if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
939 DMWARN_LIMIT("Read failure on mirror device %s. "
940 "Trying alternative device.",
941 m->dev->name);
942 queue_bio(m->ms, bio, bio_rw(bio));
943 return;
944 }
945
946 DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
947 m->dev->name);
948 bio_endio(bio, -EIO);
949}
950
951/* Asynchronous read. */
952static void read_async_bio(struct mirror *m, struct bio *bio)
953{
954 struct io_region io;
955 struct dm_io_request io_req = {
956 .bi_rw = READ,
957 .mem.type = DM_IO_BVEC,
958 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
959 .notify.fn = read_callback,
960 .notify.context = bio,
961 .client = m->ms->io_client,
962 };
963
964 map_region(&io, m, bio);
965 bio_set_m(bio, m);
966 (void) dm_io(&io_req, 1, &io, NULL);
762} 967}
763 968
764static void do_reads(struct mirror_set *ms, struct bio_list *reads) 969static void do_reads(struct mirror_set *ms, struct bio_list *reads)
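choose_mirror() above walks backwards from the current default mirror, wrapping at the front of the array, until it finds a device with no recorded errors. A self-contained sketch of that walk follows; the mirror names, error counts and array size are made up, and the index arithmetic merely illustrates the wrap-around cycle.

/*
 * Sketch of the choose_mirror() walk added above: start at the default
 * mirror and step backwards (wrapping) until a mirror with no recorded
 * errors is found, or give up after a full cycle.
 */
#include <stdio.h>

struct mirror {
        const char *name;
        int error_count;
};

static struct mirror mirrors[] = {
        { "mirror0", 1 },       /* has failed */
        { "mirror1", 0 },       /* healthy    */
        { "mirror2", 1 },       /* has failed */
};
static const int nr_mirrors = 3;
static int default_mirror = 2;  /* index, like ms->default_mirror */

static struct mirror *choose_mirror(void)
{
        int i = default_mirror;

        do {
                if (!mirrors[i].error_count)
                        return &mirrors[i];
                i = (i + nr_mirrors - 1) % nr_mirrors;  /* step back, wrap */
        } while (i != default_mirror);

        return NULL;            /* every mirror has failed */
}

int main(void)
{
        struct mirror *m = choose_mirror();

        printf("read goes to: %s\n", m ? m->name : "no usable mirror, -EIO");
        return 0;
}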
@@ -769,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
769 974
770 while ((bio = bio_list_pop(reads))) { 975 while ((bio = bio_list_pop(reads))) {
771 region = bio_to_region(&ms->rh, bio); 976 region = bio_to_region(&ms->rh, bio);
977 m = get_default_mirror(ms);
772 978
773 /* 979 /*
774 * We can only read balance if the region is in sync. 980 * We can only read balance if the region is in sync.
775 */ 981 */
776 if (rh_in_sync(&ms->rh, region, 1)) 982 if (likely(rh_in_sync(&ms->rh, region, 1)))
777 m = choose_mirror(ms, bio->bi_sector); 983 m = choose_mirror(ms, bio->bi_sector);
778 else 984 else if (m && atomic_read(&m->error_count))
779 m = ms->default_mirror; 985 m = NULL;
780 986
781 map_bio(ms, m, bio); 987 if (likely(m))
782 generic_make_request(bio); 988 read_async_bio(m, bio);
989 else
990 bio_endio(bio, -EIO);
783 } 991 }
784} 992}
785 993
@@ -793,15 +1001,70 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
793 * RECOVERING: delay the io until recovery completes 1001 * RECOVERING: delay the io until recovery completes
794 * NOSYNC: increment pending, just write to the default mirror 1002 * NOSYNC: increment pending, just write to the default mirror
795 *---------------------------------------------------------------*/ 1003 *---------------------------------------------------------------*/
1004
1005/* __bio_mark_nosync
1006 * @ms
1007 * @bio
1008 * @done
1009 * @error
1010 *
1011 * The bio was written on some mirror(s) but failed on other mirror(s).
1012 * We can successfully endio the bio but should avoid the region being
1013 * marked clean by setting the state RH_NOSYNC.
1014 *
1015 * This function is _not_ safe in interrupt context!
1016 */
1017static void __bio_mark_nosync(struct mirror_set *ms,
1018 struct bio *bio, unsigned done, int error)
1019{
1020 unsigned long flags;
1021 struct region_hash *rh = &ms->rh;
1022 struct dirty_log *log = ms->rh.log;
1023 struct region *reg;
1024 region_t region = bio_to_region(rh, bio);
1025 int recovering = 0;
1026
1027 /* We must inform the log that the sync count has changed. */
1028 log->type->set_region_sync(log, region, 0);
1029 ms->in_sync = 0;
1030
1031 read_lock(&rh->hash_lock);
1032 reg = __rh_find(rh, region);
1033 read_unlock(&rh->hash_lock);
1034
1035 /* region hash entry should exist because write was in-flight */
1036 BUG_ON(!reg);
1037 BUG_ON(!list_empty(&reg->list));
1038
1039 spin_lock_irqsave(&rh->region_lock, flags);
1040 /*
1041 * Possible cases:
1042 * 1) RH_DIRTY
1043 * 2) RH_NOSYNC: was dirty, other preceeding writes failed
1044 * 3) RH_RECOVERING: flushing pending writes
1045 * Either case, the region should have not been connected to list.
1046 */
1047 recovering = (reg->state == RH_RECOVERING);
1048 reg->state = RH_NOSYNC;
1049 BUG_ON(!list_empty(&reg->list));
1050 spin_unlock_irqrestore(&rh->region_lock, flags);
1051
1052 bio_endio(bio, error);
1053 if (recovering)
1054 complete_resync_work(reg, 0);
1055}
1056
796static void write_callback(unsigned long error, void *context) 1057static void write_callback(unsigned long error, void *context)
797{ 1058{
798 unsigned int i; 1059 unsigned i, ret = 0;
799 int uptodate = 1;
800 struct bio *bio = (struct bio *) context; 1060 struct bio *bio = (struct bio *) context;
801 struct mirror_set *ms; 1061 struct mirror_set *ms;
1062 int uptodate = 0;
1063 int should_wake = 0;
1064 unsigned long flags;
802 1065
803 ms = bio_get_ms(bio); 1066 ms = bio_get_m(bio)->ms;
804 bio_set_ms(bio, NULL); 1067 bio_set_m(bio, NULL);
805 1068
806 /* 1069 /*
807 * NOTE: We don't decrement the pending count here, 1070 * NOTE: We don't decrement the pending count here,
@@ -809,26 +1072,42 @@ static void write_callback(unsigned long error, void *context)
809 * This way we handle both writes to SYNC and NOSYNC 1072 * This way we handle both writes to SYNC and NOSYNC
810 * regions with the same code. 1073 * regions with the same code.
811 */ 1074 */
1075 if (likely(!error))
1076 goto out;
1077
1078 for (i = 0; i < ms->nr_mirrors; i++)
1079 if (test_bit(i, &error))
1080 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
1081 else
1082 uptodate = 1;
812 1083
813 if (error) { 1084 if (unlikely(!uptodate)) {
1085 DMERR("All replicated volumes dead, failing I/O");
1086 /* None of the writes succeeded, fail the I/O. */
1087 ret = -EIO;
1088 } else if (errors_handled(ms)) {
814 /* 1089 /*
815 * only error the io if all mirrors failed. 1090 * Need to raise event. Since raising
816 * FIXME: bogus 1091 * events can block, we need to do it in
1092 * the main thread.
817 */ 1093 */
818 uptodate = 0; 1094 spin_lock_irqsave(&ms->lock, flags);
819 for (i = 0; i < ms->nr_mirrors; i++) 1095 if (!ms->failures.head)
820 if (!test_bit(i, &error)) { 1096 should_wake = 1;
821 uptodate = 1; 1097 bio_list_add(&ms->failures, bio);
822 break; 1098 spin_unlock_irqrestore(&ms->lock, flags);
823 } 1099 if (should_wake)
1100 wake(ms);
1101 return;
824 } 1102 }
825 bio_endio(bio, 0); 1103out:
1104 bio_endio(bio, ret);
826} 1105}
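
The error argument that dm_io() hands to write_callback() is a bitmask with one bit per mirror leg, so the callback only fails the bio when every bit is set. A minimal userspace sketch of that decision follows; plain bit tests stand in for test_bit(), and the helper names are made up for illustration:

    #include <stdio.h>

    /* Returns 0 if at least one leg took the write, -5 (i.e. -EIO) otherwise. */
    static int write_result(unsigned long error, unsigned nr_mirrors)
    {
            unsigned i;
            int uptodate = 0;

            for (i = 0; i < nr_mirrors; i++) {
                    if (error & (1UL << i))
                            printf("leg %u failed; fail_mirror() would run\n", i);
                    else
                            uptodate = 1;
            }
            return uptodate ? 0 : -5;
    }

    int main(void)
    {
            printf("one leg down   -> %d\n", write_result(0x1, 2)); /* 0 */
            printf("both legs down -> %d\n", write_result(0x3, 2)); /* -5 */
            return 0;
    }

When at least one leg survives and errors_handled() is set, the real callback instead hands the bio to the worker thread, because raising a dm event can block and must not be done from the completion context.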
827 1106
828static void do_write(struct mirror_set *ms, struct bio *bio) 1107static void do_write(struct mirror_set *ms, struct bio *bio)
829{ 1108{
830 unsigned int i; 1109 unsigned int i;
831 struct io_region io[KCOPYD_MAX_REGIONS+1]; 1110 struct io_region io[ms->nr_mirrors], *dest = io;
832 struct mirror *m; 1111 struct mirror *m;
833 struct dm_io_request io_req = { 1112 struct dm_io_request io_req = {
834 .bi_rw = WRITE, 1113 .bi_rw = WRITE,
@@ -839,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
839 .client = ms->io_client, 1118 .client = ms->io_client,
840 }; 1119 };
841 1120
842 for (i = 0; i < ms->nr_mirrors; i++) { 1121 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
843 m = ms->mirror + i; 1122 map_region(dest++, m, bio);
844
845 io[i].bdev = m->dev->bdev;
846 io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
847 io[i].count = bio->bi_size >> 9;
848 }
849 1123
850 bio_set_ms(bio, ms); 1124 /*
1125 * Use default mirror because we only need it to retrieve the reference
1126 * to the mirror set in write_callback().
1127 */
1128 bio_set_m(bio, get_default_mirror(ms));
851 1129
852 (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); 1130 (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
853} 1131}
@@ -900,43 +1178,125 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
900 /* 1178 /*
901 * Dispatch io. 1179 * Dispatch io.
902 */ 1180 */
903 if (unlikely(ms->log_failure)) 1181 if (unlikely(ms->log_failure)) {
1182 spin_lock_irq(&ms->lock);
1183 bio_list_merge(&ms->failures, &sync);
1184 spin_unlock_irq(&ms->lock);
1185 } else
904 while ((bio = bio_list_pop(&sync))) 1186 while ((bio = bio_list_pop(&sync)))
905 bio_endio(bio, -EIO); 1187 do_write(ms, bio);
906 else while ((bio = bio_list_pop(&sync)))
907 do_write(ms, bio);
908 1188
909 while ((bio = bio_list_pop(&recover))) 1189 while ((bio = bio_list_pop(&recover)))
910 rh_delay(&ms->rh, bio); 1190 rh_delay(&ms->rh, bio);
911 1191
912 while ((bio = bio_list_pop(&nosync))) { 1192 while ((bio = bio_list_pop(&nosync))) {
913 map_bio(ms, ms->default_mirror, bio); 1193 map_bio(get_default_mirror(ms), bio);
914 generic_make_request(bio); 1194 generic_make_request(bio);
915 } 1195 }
916} 1196}
917 1197
1198static void do_failures(struct mirror_set *ms, struct bio_list *failures)
1199{
1200 struct bio *bio;
1201
1202 if (!failures->head)
1203 return;
1204
1205 if (!ms->log_failure) {
1206 while ((bio = bio_list_pop(failures)))
1207 __bio_mark_nosync(ms, bio, bio->bi_size, 0);
1208 return;
1209 }
1210
1211 /*
1212 * If the log has failed, unattempted writes are being
1213 * put on the failures list. We can't issue those writes
1214 * until a log has been marked, so we must store them.
1215 *
1216 * If a 'noflush' suspend is in progress, we can requeue
 1217 * the I/Os to the core. This gives userspace a chance
1218 * to reconfigure the mirror, at which point the core
1219 * will reissue the writes. If the 'noflush' flag is
1220 * not set, we have no choice but to return errors.
1221 *
1222 * Some writes on the failures list may have been
1223 * submitted before the log failure and represent a
1224 * failure to write to one of the devices. It is ok
1225 * for us to treat them the same and requeue them
1226 * as well.
1227 */
1228 if (dm_noflush_suspending(ms->ti)) {
1229 while ((bio = bio_list_pop(failures)))
1230 bio_endio(bio, DM_ENDIO_REQUEUE);
1231 return;
1232 }
1233
1234 if (atomic_read(&ms->suspend)) {
1235 while ((bio = bio_list_pop(failures)))
1236 bio_endio(bio, -EIO);
1237 return;
1238 }
1239
1240 spin_lock_irq(&ms->lock);
1241 bio_list_merge(&ms->failures, failures);
1242 spin_unlock_irq(&ms->lock);
1243
1244 wake(ms);
1245}
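
The ordering of the checks above matters: mark-and-complete while the log is still healthy, requeue to the dm core during a noflush suspend, hard-fail during an ordinary suspend, and otherwise park the bios for a later retry. A compact model of that precedence, with made-up predicate names standing in for the kernel-side state tests:

    #include <stdio.h>

    enum failure_action {
            MARK_NOSYNC,      /* endio now, let the region resync later */
            REQUEUE_TO_CORE,  /* bio_endio(bio, DM_ENDIO_REQUEUE) */
            FAIL_WITH_EIO,    /* bio_endio(bio, -EIO) */
            HOLD_AND_RETRY,   /* keep on ms->failures and wake the worker */
    };

    static enum failure_action classify(int log_failed, int noflush_suspending,
                                        int suspended)
    {
            if (!log_failed)
                    return MARK_NOSYNC;
            if (noflush_suspending)
                    return REQUEUE_TO_CORE;
            if (suspended)
                    return FAIL_WITH_EIO;
            return HOLD_AND_RETRY;
    }

    int main(void)
    {
            printf("%d\n", classify(1, 1, 0)); /* 1: requeue to core */
            printf("%d\n", classify(1, 0, 0)); /* 3: hold and retry */
            return 0;
    }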
1246
1247static void trigger_event(struct work_struct *work)
1248{
1249 struct mirror_set *ms =
1250 container_of(work, struct mirror_set, trigger_event);
1251
1252 dm_table_event(ms->ti->table);
1253}
1254
918/*----------------------------------------------------------------- 1255/*-----------------------------------------------------------------
919 * kmirrord 1256 * kmirrord
920 *---------------------------------------------------------------*/ 1257 *---------------------------------------------------------------*/
921static void do_mirror(struct work_struct *work) 1258static int _do_mirror(struct work_struct *work)
922{ 1259{
 923 struct mirror_set *ms = container_of(work, struct mirror_set, 1260 struct mirror_set *ms = container_of(work, struct mirror_set,
924 kmirrord_work); 1261 kmirrord_work);
925 struct bio_list reads, writes; 1262 struct bio_list reads, writes, failures;
1263 unsigned long flags;
926 1264
927 spin_lock(&ms->lock); 1265 spin_lock_irqsave(&ms->lock, flags);
928 reads = ms->reads; 1266 reads = ms->reads;
929 writes = ms->writes; 1267 writes = ms->writes;
1268 failures = ms->failures;
930 bio_list_init(&ms->reads); 1269 bio_list_init(&ms->reads);
931 bio_list_init(&ms->writes); 1270 bio_list_init(&ms->writes);
932 spin_unlock(&ms->lock); 1271 bio_list_init(&ms->failures);
1272 spin_unlock_irqrestore(&ms->lock, flags);
933 1273
934 rh_update_states(&ms->rh); 1274 rh_update_states(&ms->rh);
935 do_recovery(ms); 1275 do_recovery(ms);
936 do_reads(ms, &reads); 1276 do_reads(ms, &reads);
937 do_writes(ms, &writes); 1277 do_writes(ms, &writes);
1278 do_failures(ms, &failures);
1279
1280 return (ms->failures.head) ? 1 : 0;
938} 1281}
939 1282
1283static void do_mirror(struct work_struct *work)
1284{
1285 /*
1286 * If _do_mirror returns 1, we give it
1287 * another shot. This helps for cases like
1288 * 'suspend' where we call flush_workqueue
1289 * and expect all work to be finished. If
1290 * a failure happens during a suspend, we
1291 * couldn't issue a 'wake' because it would
1292 * not be honored. Therefore, we return '1'
1293 * from _do_mirror, and retry here.
1294 */
1295 while (_do_mirror(work))
1296 schedule();
1297}
1298
1299
940/*----------------------------------------------------------------- 1300/*-----------------------------------------------------------------
941 * Target functions 1301 * Target functions
942 *---------------------------------------------------------------*/ 1302 *---------------------------------------------------------------*/
@@ -965,11 +1325,23 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
965 ms->nr_mirrors = nr_mirrors; 1325 ms->nr_mirrors = nr_mirrors;
966 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 1326 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
967 ms->in_sync = 0; 1327 ms->in_sync = 0;
968 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; 1328 ms->log_failure = 0;
1329 atomic_set(&ms->suspend, 0);
1330 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
1331
1332 len = sizeof(struct dm_raid1_read_record);
1333 ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
1334 len);
1335 if (!ms->read_record_pool) {
1336 ti->error = "Error creating mirror read_record_pool";
1337 kfree(ms);
1338 return NULL;
1339 }
969 1340
970 ms->io_client = dm_io_client_create(DM_IO_PAGES); 1341 ms->io_client = dm_io_client_create(DM_IO_PAGES);
971 if (IS_ERR(ms->io_client)) { 1342 if (IS_ERR(ms->io_client)) {
972 ti->error = "Error creating dm_io client"; 1343 ti->error = "Error creating dm_io client";
1344 mempool_destroy(ms->read_record_pool);
973 kfree(ms); 1345 kfree(ms);
974 return NULL; 1346 return NULL;
975 } 1347 }
@@ -977,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
977 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 1349 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
978 ti->error = "Error creating dirty region hash"; 1350 ti->error = "Error creating dirty region hash";
979 dm_io_client_destroy(ms->io_client); 1351 dm_io_client_destroy(ms->io_client);
1352 mempool_destroy(ms->read_record_pool);
980 kfree(ms); 1353 kfree(ms);
981 return NULL; 1354 return NULL;
982 } 1355 }
@@ -992,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
992 1365
993 dm_io_client_destroy(ms->io_client); 1366 dm_io_client_destroy(ms->io_client);
994 rh_exit(&ms->rh); 1367 rh_exit(&ms->rh);
1368 mempool_destroy(ms->read_record_pool);
995 kfree(ms); 1369 kfree(ms);
996} 1370}
997 1371
@@ -1019,6 +1393,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
1019 } 1393 }
1020 1394
1021 ms->mirror[mirror].ms = ms; 1395 ms->mirror[mirror].ms = ms;
1396 atomic_set(&(ms->mirror[mirror].error_count), 0);
1397 ms->mirror[mirror].error_type = 0;
1022 ms->mirror[mirror].offset = offset; 1398 ms->mirror[mirror].offset = offset;
1023 1399
1024 return 0; 1400 return 0;
@@ -1171,6 +1547,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1171 goto err_free_context; 1547 goto err_free_context;
1172 } 1548 }
1173 INIT_WORK(&ms->kmirrord_work, do_mirror); 1549 INIT_WORK(&ms->kmirrord_work, do_mirror);
1550 INIT_WORK(&ms->trigger_event, trigger_event);
1174 1551
1175 r = parse_features(ms, argc, argv, &args_used); 1552 r = parse_features(ms, argc, argv, &args_used);
1176 if (r) 1553 if (r)
@@ -1220,14 +1597,15 @@ static void mirror_dtr(struct dm_target *ti)
1220 1597
1221static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) 1598static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
1222{ 1599{
1600 unsigned long flags;
1223 int should_wake = 0; 1601 int should_wake = 0;
1224 struct bio_list *bl; 1602 struct bio_list *bl;
1225 1603
1226 bl = (rw == WRITE) ? &ms->writes : &ms->reads; 1604 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
1227 spin_lock(&ms->lock); 1605 spin_lock_irqsave(&ms->lock, flags);
1228 should_wake = !(bl->head); 1606 should_wake = !(bl->head);
1229 bio_list_add(bl, bio); 1607 bio_list_add(bl, bio);
1230 spin_unlock(&ms->lock); 1608 spin_unlock_irqrestore(&ms->lock, flags);
1231 1609
1232 if (should_wake) 1610 if (should_wake)
1233 wake(ms); 1611 wake(ms);
@@ -1242,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1242 int r, rw = bio_rw(bio); 1620 int r, rw = bio_rw(bio);
1243 struct mirror *m; 1621 struct mirror *m;
1244 struct mirror_set *ms = ti->private; 1622 struct mirror_set *ms = ti->private;
1245 1623 struct dm_raid1_read_record *read_record = NULL;
1246 map_context->ll = bio_to_region(&ms->rh, bio);
1247 1624
1248 if (rw == WRITE) { 1625 if (rw == WRITE) {
1626 /* Save region for mirror_end_io() handler */
1627 map_context->ll = bio_to_region(&ms->rh, bio);
1249 queue_bio(ms, bio, rw); 1628 queue_bio(ms, bio, rw);
1250 return DM_MAPIO_SUBMITTED; 1629 return DM_MAPIO_SUBMITTED;
1251 } 1630 }
@@ -1255,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1255 if (r < 0 && r != -EWOULDBLOCK) 1634 if (r < 0 && r != -EWOULDBLOCK)
1256 return r; 1635 return r;
1257 1636
1258 if (r == -EWOULDBLOCK) /* FIXME: ugly */
1259 r = DM_MAPIO_SUBMITTED;
1260
1261 /* 1637 /*
 1262 * We don't want to fast track a recovery just for a read 1638 * If the region is not in-sync, queue the bio.
1263 * ahead. So we just let it silently fail.
1264 * FIXME: get rid of this.
1265 */ 1639 */
1266 if (!r && rw == READA) 1640 if (!r || (r == -EWOULDBLOCK)) {
1267 return -EIO; 1641 if (rw == READA)
1642 return -EWOULDBLOCK;
1268 1643
1269 if (!r) {
1270 /* Pass this io over to the daemon */
1271 queue_bio(ms, bio, rw); 1644 queue_bio(ms, bio, rw);
1272 return DM_MAPIO_SUBMITTED; 1645 return DM_MAPIO_SUBMITTED;
1273 } 1646 }
1274 1647
1648 /*
1649 * The region is in-sync and we can perform reads directly.
1650 * Store enough information so we can retry if it fails.
1651 */
1275 m = choose_mirror(ms, bio->bi_sector); 1652 m = choose_mirror(ms, bio->bi_sector);
1276 if (!m) 1653 if (unlikely(!m))
1277 return -EIO; 1654 return -EIO;
1278 1655
1279 map_bio(ms, m, bio); 1656 read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
1657 if (likely(read_record)) {
1658 dm_bio_record(&read_record->details, bio);
1659 map_context->ptr = read_record;
1660 read_record->m = m;
1661 }
1662
1663 map_bio(m, bio);
1664
1280 return DM_MAPIO_REMAPPED; 1665 return DM_MAPIO_REMAPPED;
1281} 1666}
1282 1667
@@ -1285,71 +1670,173 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1285{ 1670{
1286 int rw = bio_rw(bio); 1671 int rw = bio_rw(bio);
1287 struct mirror_set *ms = (struct mirror_set *) ti->private; 1672 struct mirror_set *ms = (struct mirror_set *) ti->private;
1288 region_t region = map_context->ll; 1673 struct mirror *m = NULL;
1674 struct dm_bio_details *bd = NULL;
1675 struct dm_raid1_read_record *read_record = map_context->ptr;
1289 1676
1290 /* 1677 /*
1291 * We need to dec pending if this was a write. 1678 * We need to dec pending if this was a write.
1292 */ 1679 */
1293 if (rw == WRITE) 1680 if (rw == WRITE) {
1294 rh_dec(&ms->rh, region); 1681 rh_dec(&ms->rh, map_context->ll);
1682 return error;
1683 }
1295 1684
1296 return 0; 1685 if (error == -EOPNOTSUPP)
1686 goto out;
1687
1688 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
1689 goto out;
1690
1691 if (unlikely(error)) {
1692 if (!read_record) {
1693 /*
1694 * There wasn't enough memory to record necessary
1695 * information for a retry or there was no other
1696 * mirror in-sync.
1697 */
1698 DMERR_LIMIT("Mirror read failed from %s.",
1699 m->dev->name);
1700 return -EIO;
1701 }
1702 DMERR("Mirror read failed from %s. Trying alternative device.",
1703 m->dev->name);
1704
1705 m = read_record->m;
1706 fail_mirror(m, DM_RAID1_READ_ERROR);
1707
1708 /*
1709 * A failed read is requeued for another attempt using an intact
1710 * mirror.
1711 */
1712 if (default_ok(m) || mirror_available(ms, bio)) {
1713 bd = &read_record->details;
1714
1715 dm_bio_restore(bd, bio);
1716 mempool_free(read_record, ms->read_record_pool);
1717 map_context->ptr = NULL;
1718 queue_bio(ms, bio, rw);
1719 return 1;
1720 }
1721 DMERR("All replicated volumes dead, failing I/O");
1722 }
1723
1724out:
1725 if (read_record) {
1726 mempool_free(read_record, ms->read_record_pool);
1727 map_context->ptr = NULL;
1728 }
1729
1730 return error;
1297} 1731}
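
The retry path depends on dm_bio_record()/dm_bio_restore(): enough of the bio is captured at map time that, after a leg fails, the bio can be rewound and queued to a different mirror. A rough userspace analogue of that save-and-restore idea (the struct fields here are illustrative, not the real struct dm_bio_details):

    #include <stdio.h>

    struct fake_bio { unsigned long sector; int dev; unsigned size; };
    struct fake_bio_details { unsigned long sector; int dev; unsigned size; };

    static void record(struct fake_bio_details *d, const struct fake_bio *b)
    {
            d->sector = b->sector; d->dev = b->dev; d->size = b->size;
    }

    static void restore(const struct fake_bio_details *d, struct fake_bio *b)
    {
            b->sector = d->sector; b->dev = d->dev; b->size = d->size;
    }

    int main(void)
    {
            struct fake_bio bio = { 2048, 0, 4096 };
            struct fake_bio_details saved;

            record(&saved, &bio);   /* at map time, before the read is issued */
            bio.dev = 1;            /* remapped to leg 1; the read then fails */
            restore(&saved, &bio);  /* rewind so it can be queued to another leg */
            printf("sector %lu dev %d size %u\n", bio.sector, bio.dev, bio.size);
            return 0;
    }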
1298 1732
1299static void mirror_postsuspend(struct dm_target *ti) 1733static void mirror_presuspend(struct dm_target *ti)
1300{ 1734{
1301 struct mirror_set *ms = (struct mirror_set *) ti->private; 1735 struct mirror_set *ms = (struct mirror_set *) ti->private;
1302 struct dirty_log *log = ms->rh.log; 1736 struct dirty_log *log = ms->rh.log;
1303 1737
1738 atomic_set(&ms->suspend, 1);
1739
1740 /*
1741 * We must finish up all the work that we've
1742 * generated (i.e. recovery work).
1743 */
1304 rh_stop_recovery(&ms->rh); 1744 rh_stop_recovery(&ms->rh);
1305 1745
1306 /* Wait for all I/O we generated to complete */
1307 wait_event(_kmirrord_recovery_stopped, 1746 wait_event(_kmirrord_recovery_stopped,
1308 !atomic_read(&ms->rh.recovery_in_flight)); 1747 !atomic_read(&ms->rh.recovery_in_flight));
1309 1748
1749 if (log->type->presuspend && log->type->presuspend(log))
1750 /* FIXME: need better error handling */
1751 DMWARN("log presuspend failed");
1752
1753 /*
1754 * Now that recovery is complete/stopped and the
1755 * delayed bios are queued, we need to wait for
1756 * the worker thread to complete. This way,
1757 * we know that all of our I/O has been pushed.
1758 */
1759 flush_workqueue(ms->kmirrord_wq);
1760}
1761
1762static void mirror_postsuspend(struct dm_target *ti)
1763{
1764 struct mirror_set *ms = ti->private;
1765 struct dirty_log *log = ms->rh.log;
1766
1310 if (log->type->postsuspend && log->type->postsuspend(log)) 1767 if (log->type->postsuspend && log->type->postsuspend(log))
1311 /* FIXME: need better error handling */ 1768 /* FIXME: need better error handling */
1312 DMWARN("log suspend failed"); 1769 DMWARN("log postsuspend failed");
1313} 1770}
1314 1771
1315static void mirror_resume(struct dm_target *ti) 1772static void mirror_resume(struct dm_target *ti)
1316{ 1773{
1317 struct mirror_set *ms = (struct mirror_set *) ti->private; 1774 struct mirror_set *ms = ti->private;
1318 struct dirty_log *log = ms->rh.log; 1775 struct dirty_log *log = ms->rh.log;
1776
1777 atomic_set(&ms->suspend, 0);
1319 if (log->type->resume && log->type->resume(log)) 1778 if (log->type->resume && log->type->resume(log))
1320 /* FIXME: need better error handling */ 1779 /* FIXME: need better error handling */
1321 DMWARN("log resume failed"); 1780 DMWARN("log resume failed");
1322 rh_start_recovery(&ms->rh); 1781 rh_start_recovery(&ms->rh);
1323} 1782}
1324 1783
1784/*
1785 * device_status_char
1786 * @m: mirror device/leg we want the status of
1787 *
1788 * We return one character representing the most severe error
1789 * we have encountered.
1790 * A => Alive - No failures
1791 * D => Dead - A write failure occurred leaving mirror out-of-sync
 1792 * S => Sync - A synchronization failure occurred, mirror out-of-sync
1793 * R => Read - A read failure occurred, mirror data unaffected
1794 *
1795 * Returns: <char>
1796 */
1797static char device_status_char(struct mirror *m)
1798{
1799 if (!atomic_read(&(m->error_count)))
1800 return 'A';
1801
1802 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1803 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1804 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1805}
1806
1807
1325static int mirror_status(struct dm_target *ti, status_type_t type, 1808static int mirror_status(struct dm_target *ti, status_type_t type,
1326 char *result, unsigned int maxlen) 1809 char *result, unsigned int maxlen)
1327{ 1810{
1328 unsigned int m, sz = 0; 1811 unsigned int m, sz = 0;
1329 struct mirror_set *ms = (struct mirror_set *) ti->private; 1812 struct mirror_set *ms = (struct mirror_set *) ti->private;
1813 struct dirty_log *log = ms->rh.log;
1814 char buffer[ms->nr_mirrors + 1];
1330 1815
1331 switch (type) { 1816 switch (type) {
1332 case STATUSTYPE_INFO: 1817 case STATUSTYPE_INFO:
1333 DMEMIT("%d ", ms->nr_mirrors); 1818 DMEMIT("%d ", ms->nr_mirrors);
1334 for (m = 0; m < ms->nr_mirrors; m++) 1819 for (m = 0; m < ms->nr_mirrors; m++) {
1335 DMEMIT("%s ", ms->mirror[m].dev->name); 1820 DMEMIT("%s ", ms->mirror[m].dev->name);
1821 buffer[m] = device_status_char(&(ms->mirror[m]));
1822 }
1823 buffer[m] = '\0';
1336 1824
1337 DMEMIT("%llu/%llu 0 ", 1825 DMEMIT("%llu/%llu 1 %s ",
1338 (unsigned long long)ms->rh.log->type-> 1826 (unsigned long long)log->type->get_sync_count(ms->rh.log),
1339 get_sync_count(ms->rh.log), 1827 (unsigned long long)ms->nr_regions, buffer);
1340 (unsigned long long)ms->nr_regions);
1341 1828
1342 sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); 1829 sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
1343 1830
1344 break; 1831 break;
1345 1832
1346 case STATUSTYPE_TABLE: 1833 case STATUSTYPE_TABLE:
1347 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); 1834 sz = log->type->status(ms->rh.log, type, result, maxlen);
1348 1835
1349 DMEMIT("%d", ms->nr_mirrors); 1836 DMEMIT("%d", ms->nr_mirrors);
1350 for (m = 0; m < ms->nr_mirrors; m++) 1837 for (m = 0; m < ms->nr_mirrors; m++)
1351 DMEMIT(" %s %llu", ms->mirror[m].dev->name, 1838 DMEMIT(" %s %llu", ms->mirror[m].dev->name,
1352 (unsigned long long)ms->mirror[m].offset); 1839 (unsigned long long)ms->mirror[m].offset);
1353 1840
1354 if (ms->features & DM_RAID1_HANDLE_ERRORS) 1841 if (ms->features & DM_RAID1_HANDLE_ERRORS)
1355 DMEMIT(" 1 handle_errors"); 1842 DMEMIT(" 1 handle_errors");
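
Taken together with the sync-count field, a STATUSTYPE_INFO line for a healthy, fully synced two-leg mirror would now look roughly like the line below; the device numbers and region count are hypothetical, and the dirty log appends its own status after the mirror fields:

    2 253:3 253:4 125/125 1 AA <dirty log status>

If the second leg had seen a write failure its character would read 'D'; a read failure alone would show 'R', leaving the mirror data itself usable, per device_status_char() above.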
@@ -1360,12 +1847,13 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1360 1847
1361static struct target_type mirror_target = { 1848static struct target_type mirror_target = {
1362 .name = "mirror", 1849 .name = "mirror",
1363 .version = {1, 0, 3}, 1850 .version = {1, 0, 20},
1364 .module = THIS_MODULE, 1851 .module = THIS_MODULE,
1365 .ctr = mirror_ctr, 1852 .ctr = mirror_ctr,
1366 .dtr = mirror_dtr, 1853 .dtr = mirror_dtr,
1367 .map = mirror_map, 1854 .map = mirror_map,
1368 .end_io = mirror_end_io, 1855 .end_io = mirror_end_io,
1856 .presuspend = mirror_presuspend,
1369 .postsuspend = mirror_postsuspend, 1857 .postsuspend = mirror_postsuspend,
1370 .resume = mirror_resume, 1858 .resume = mirror_resume,
1371 .status = mirror_status, 1859 .status = mirror_status,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index cee16fadd9ee..ae24eab8cd81 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s)
213 213
214/* 214/*
215 * Implementation of the exception hash tables. 215 * Implementation of the exception hash tables.
216 * The lowest hash_shift bits of the chunk number are ignored, allowing
217 * some consecutive chunks to be grouped together.
216 */ 218 */
217static int init_exception_table(struct exception_table *et, uint32_t size) 219static int init_exception_table(struct exception_table *et, uint32_t size,
220 unsigned hash_shift)
218{ 221{
219 unsigned int i; 222 unsigned int i;
220 223
224 et->hash_shift = hash_shift;
221 et->hash_mask = size - 1; 225 et->hash_mask = size - 1;
222 et->table = dm_vcalloc(size, sizeof(struct list_head)); 226 et->table = dm_vcalloc(size, sizeof(struct list_head));
223 if (!et->table) 227 if (!et->table)
@@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
248 252
249static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 253static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
250{ 254{
251 return chunk & et->hash_mask; 255 return (chunk >> et->hash_shift) & et->hash_mask;
252} 256}
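
Because the low hash_shift bits are discarded, a whole run of consecutive chunks lands in one bucket, which is what lets lookup_exception() find a range-spanning entry with a single chain walk. A standalone illustration of the bucket calculation (shift and table size picked arbitrarily):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned hash_shift = 8;        /* e.g. DM_CHUNK_CONSECUTIVE_BITS */
            uint32_t hash_mask = 1024 - 1;  /* a table of 1024 buckets */
            uint64_t chunk;

            /* chunks 4096..4351 share bucket 16; 4352 starts bucket 17 */
            for (chunk = 4096; chunk <= 4352; chunk += 64)
                    printf("chunk %llu -> bucket %u\n",
                           (unsigned long long)chunk,
                           (unsigned)((chunk >> hash_shift) & hash_mask));
            return 0;
    }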
253 257
254static void insert_exception(struct exception_table *eh, 258static void insert_exception(struct exception_table *eh,
@@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
275 279
276 slot = &et->table[exception_hash(et, chunk)]; 280 slot = &et->table[exception_hash(et, chunk)];
277 list_for_each_entry (e, slot, hash_list) 281 list_for_each_entry (e, slot, hash_list)
278 if (e->old_chunk == chunk) 282 if (chunk >= e->old_chunk &&
283 chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
279 return e; 284 return e;
280 285
281 return NULL; 286 return NULL;
@@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
307 mempool_free(pe, pending_pool); 312 mempool_free(pe, pending_pool);
308} 313}
309 314
315static void insert_completed_exception(struct dm_snapshot *s,
316 struct dm_snap_exception *new_e)
317{
318 struct exception_table *eh = &s->complete;
319 struct list_head *l;
320 struct dm_snap_exception *e = NULL;
321
322 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
323
324 /* Add immediately if this table doesn't support consecutive chunks */
325 if (!eh->hash_shift)
326 goto out;
327
328 /* List is ordered by old_chunk */
329 list_for_each_entry_reverse(e, l, hash_list) {
330 /* Insert after an existing chunk? */
331 if (new_e->old_chunk == (e->old_chunk +
332 dm_consecutive_chunk_count(e) + 1) &&
333 new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
334 dm_consecutive_chunk_count(e) + 1)) {
335 dm_consecutive_chunk_count_inc(e);
336 free_exception(new_e);
337 return;
338 }
339
340 /* Insert before an existing chunk? */
341 if (new_e->old_chunk == (e->old_chunk - 1) &&
342 new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
343 dm_consecutive_chunk_count_inc(e);
344 e->old_chunk--;
345 e->new_chunk--;
346 free_exception(new_e);
347 return;
348 }
349
350 if (new_e->old_chunk > e->old_chunk)
351 break;
352 }
353
354out:
355 list_add(&new_e->hash_list, e ? &e->hash_list : l);
356}
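
A worked example of the merge, assuming the 8-bit consecutive count is available: suppose the table already holds an exception with old_chunk 100, new chunk number 200 and a consecutive count of 1, i.e. old chunks 100-101 map to new chunks 200-201. A new exception 102 -> 202 passes the "insert after" test, so the count is bumped to 2 and the new entry is freed; a new exception 99 -> 199 passes the "insert before" test, so the count is bumped and both chunk numbers are decremented, giving 99-101 -> 199-201. Any exception that is not adjacent on both sides is simply linked into the chain, which is kept ordered by old_chunk.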
357
310int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 358int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
311{ 359{
312 struct dm_snap_exception *e; 360 struct dm_snap_exception *e;
@@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
316 return -ENOMEM; 364 return -ENOMEM;
317 365
318 e->old_chunk = old; 366 e->old_chunk = old;
367
368 /* Consecutive_count is implicitly initialised to zero */
319 e->new_chunk = new; 369 e->new_chunk = new;
320 insert_exception(&s->complete, e); 370
371 insert_completed_exception(s, e);
372
321 return 0; 373 return 0;
322} 374}
323 375
@@ -334,16 +386,6 @@ static int calc_max_buckets(void)
334} 386}
335 387
336/* 388/*
337 * Rounds a number down to a power of 2.
338 */
339static uint32_t round_down(uint32_t n)
340{
341 while (n & (n - 1))
342 n &= (n - 1);
343 return n;
344}
345
346/*
347 * Allocate room for a suitable hash table. 389 * Allocate room for a suitable hash table.
348 */ 390 */
349static int init_hash_tables(struct dm_snapshot *s) 391static int init_hash_tables(struct dm_snapshot *s)
@@ -361,9 +403,9 @@ static int init_hash_tables(struct dm_snapshot *s)
361 hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; 403 hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
362 hash_size = min(hash_size, max_buckets); 404 hash_size = min(hash_size, max_buckets);
363 405
364 /* Round it down to a power of 2 */ 406 hash_size = rounddown_pow_of_two(hash_size);
365 hash_size = round_down(hash_size); 407 if (init_exception_table(&s->complete, hash_size,
366 if (init_exception_table(&s->complete, hash_size)) 408 DM_CHUNK_CONSECUTIVE_BITS))
367 return -ENOMEM; 409 return -ENOMEM;
368 410
369 /* 411 /*
@@ -374,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s)
374 if (hash_size < 64) 416 if (hash_size < 64)
375 hash_size = 64; 417 hash_size = 64;
376 418
377 if (init_exception_table(&s->pending, hash_size)) { 419 if (init_exception_table(&s->pending, hash_size, 0)) {
378 exit_exception_table(&s->complete, exception_cache); 420 exit_exception_table(&s->complete, exception_cache);
379 return -ENOMEM; 421 return -ENOMEM;
380 } 422 }
@@ -733,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
733 * Add a proper exception, and remove the 775 * Add a proper exception, and remove the
734 * in-flight exception from the list. 776 * in-flight exception from the list.
735 */ 777 */
736 insert_exception(&s->complete, e); 778 insert_completed_exception(s, e);
737 779
738 out: 780 out:
739 remove_exception(&pe->e); 781 remove_exception(&pe->e);
@@ -867,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
867} 909}
868 910
869static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 911static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
870 struct bio *bio) 912 struct bio *bio, chunk_t chunk)
871{ 913{
872 bio->bi_bdev = s->cow->bdev; 914 bio->bi_bdev = s->cow->bdev;
873 bio->bi_sector = chunk_to_sector(s, e->new_chunk) + 915 bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
874 (bio->bi_sector & s->chunk_mask); 916 (chunk - e->old_chunk)) +
917 (bio->bi_sector & s->chunk_mask);
875} 918}
876 919
877static int snapshot_map(struct dm_target *ti, struct bio *bio, 920static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -902,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
902 /* If the block is already remapped - use that, else remap it */ 945 /* If the block is already remapped - use that, else remap it */
903 e = lookup_exception(&s->complete, chunk); 946 e = lookup_exception(&s->complete, chunk);
904 if (e) { 947 if (e) {
905 remap_exception(s, e, bio); 948 remap_exception(s, e, bio, chunk);
906 goto out_unlock; 949 goto out_unlock;
907 } 950 }
908 951
@@ -919,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
919 goto out_unlock; 962 goto out_unlock;
920 } 963 }
921 964
922 remap_exception(s, &pe->e, bio); 965 remap_exception(s, &pe->e, bio, chunk);
923 bio_list_add(&pe->snapshot_bios, bio); 966 bio_list_add(&pe->snapshot_bios, bio);
924 967
925 r = DM_MAPIO_SUBMITTED; 968 r = DM_MAPIO_SUBMITTED;
@@ -1207,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1207 1250
1208static struct target_type origin_target = { 1251static struct target_type origin_target = {
1209 .name = "snapshot-origin", 1252 .name = "snapshot-origin",
1210 .version = {1, 5, 0}, 1253 .version = {1, 6, 0},
1211 .module = THIS_MODULE, 1254 .module = THIS_MODULE,
1212 .ctr = origin_ctr, 1255 .ctr = origin_ctr,
1213 .dtr = origin_dtr, 1256 .dtr = origin_dtr,
@@ -1218,7 +1261,7 @@ static struct target_type origin_target = {
1218 1261
1219static struct target_type snapshot_target = { 1262static struct target_type snapshot_target = {
1220 .name = "snapshot", 1263 .name = "snapshot",
1221 .version = {1, 5, 0}, 1264 .version = {1, 6, 0},
1222 .module = THIS_MODULE, 1265 .module = THIS_MODULE,
1223 .ctr = snapshot_ctr, 1266 .ctr = snapshot_ctr,
1224 .dtr = snapshot_dtr, 1267 .dtr = snapshot_dtr,
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 650e0f1f51d8..93bce5d49742 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -16,19 +16,22 @@
16 16
17struct exception_table { 17struct exception_table {
18 uint32_t hash_mask; 18 uint32_t hash_mask;
19 unsigned hash_shift;
19 struct list_head *table; 20 struct list_head *table;
20}; 21};
21 22
22/* 23/*
23 * The snapshot code deals with largish chunks of the disk at a 24 * The snapshot code deals with largish chunks of the disk at a
24 * time. Typically 64k - 256k. 25 * time. Typically 32k - 512k.
25 */ 26 */
26/* FIXME: can we get away with limiting these to a uint32_t ? */
27typedef sector_t chunk_t; 27typedef sector_t chunk_t;
28 28
29/* 29/*
30 * An exception is used where an old chunk of data has been 30 * An exception is used where an old chunk of data has been
31 * replaced by a new one. 31 * replaced by a new one.
32 * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
33 * of chunks that follow contiguously. Remaining bits hold the number of the
34 * chunk within the device.
32 */ 35 */
33struct dm_snap_exception { 36struct dm_snap_exception {
34 struct list_head hash_list; 37 struct list_head hash_list;
@@ -38,6 +41,49 @@ struct dm_snap_exception {
38}; 41};
39 42
40/* 43/*
 44 * Functions to manipulate consecutive chunks
45 */
46# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
47# define DM_CHUNK_CONSECUTIVE_BITS 8
48# define DM_CHUNK_NUMBER_BITS 56
49
50static inline chunk_t dm_chunk_number(chunk_t chunk)
51{
52 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
53}
54
55static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
56{
57 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
58}
59
60static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
61{
62 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
63
64 BUG_ON(!dm_consecutive_chunk_count(e));
65}
66
67# else
68# define DM_CHUNK_CONSECUTIVE_BITS 0
69
70static inline chunk_t dm_chunk_number(chunk_t chunk)
71{
72 return chunk;
73}
74
75static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
76{
77 return 0;
78}
79
80static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
81{
82}
83
84# endif
85
86/*
41 * Abstraction to handle the meta/layout of exception stores (the 87 * Abstraction to handle the meta/layout of exception stores (the
42 * COW device). 88 * COW device).
43 */ 89 */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 969944a8aba2..4de90ab3968b 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -14,10 +14,13 @@
14#include <linux/log2.h> 14#include <linux/log2.h>
15 15
16#define DM_MSG_PREFIX "striped" 16#define DM_MSG_PREFIX "striped"
17#define DM_IO_ERROR_THRESHOLD 15
17 18
18struct stripe { 19struct stripe {
19 struct dm_dev *dev; 20 struct dm_dev *dev;
20 sector_t physical_start; 21 sector_t physical_start;
22
23 atomic_t error_count;
21}; 24};
22 25
23struct stripe_c { 26struct stripe_c {
@@ -30,9 +33,29 @@ struct stripe_c {
30 uint32_t chunk_shift; 33 uint32_t chunk_shift;
31 sector_t chunk_mask; 34 sector_t chunk_mask;
32 35
36 /* Needed for handling events */
37 struct dm_target *ti;
38
 39 /* Work struct used for triggering events */
40 struct work_struct kstriped_ws;
41
33 struct stripe stripe[0]; 42 struct stripe stripe[0];
34}; 43};
35 44
45static struct workqueue_struct *kstriped;
46
47/*
48 * An event is triggered whenever a drive
49 * drops out of a stripe volume.
50 */
51static void trigger_event(struct work_struct *work)
52{
53 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws);
54
55 dm_table_event(sc->ti->table);
56
57}
58
36static inline struct stripe_c *alloc_context(unsigned int stripes) 59static inline struct stripe_c *alloc_context(unsigned int stripes)
37{ 60{
38 size_t len; 61 size_t len;
@@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
63 return -ENXIO; 86 return -ENXIO;
64 87
65 sc->stripe[stripe].physical_start = start; 88 sc->stripe[stripe].physical_start = start;
89
66 return 0; 90 return 0;
67} 91}
68 92
@@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
135 return -ENOMEM; 159 return -ENOMEM;
136 } 160 }
137 161
162 INIT_WORK(&sc->kstriped_ws, trigger_event);
163
164 /* Set pointer to dm target; used in trigger_event */
165 sc->ti = ti;
166
138 sc->stripes = stripes; 167 sc->stripes = stripes;
139 sc->stripe_width = width; 168 sc->stripe_width = width;
140 ti->split_io = chunk_size; 169 ti->split_io = chunk_size;
@@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
158 kfree(sc); 187 kfree(sc);
159 return r; 188 return r;
160 } 189 }
190 atomic_set(&(sc->stripe[i].error_count), 0);
161 } 191 }
162 192
163 ti->private = sc; 193 ti->private = sc;
194
164 return 0; 195 return 0;
165} 196}
166 197
@@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti)
172 for (i = 0; i < sc->stripes; i++) 203 for (i = 0; i < sc->stripes; i++)
173 dm_put_device(ti, sc->stripe[i].dev); 204 dm_put_device(ti, sc->stripe[i].dev);
174 205
206 flush_workqueue(kstriped);
175 kfree(sc); 207 kfree(sc);
176} 208}
177 209
@@ -190,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
190 return DM_MAPIO_REMAPPED; 222 return DM_MAPIO_REMAPPED;
191} 223}
192 224
225/*
226 * Stripe status:
227 *
228 * INFO
229 * #stripes [stripe_name <stripe_name>] [group word count]
230 * [error count 'A|D' <error count 'A|D'>]
231 *
232 * TABLE
233 * #stripes [stripe chunk size]
234 * [stripe_name physical_start <stripe_name physical_start>]
235 *
236 */
237
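
As an illustration, a healthy two-device stripe set might report something like the following, where the device names and chunk size are hypothetical:

    INFO:  2 253:1 253:2 1 AA
    TABLE: 2 512 253:1 0 253:2 0

The trailing group of status characters uses 'A' for a live device and 'D' once its error count is non-zero, mirroring the per-device counters maintained in stripe_end_io() below.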
193static int stripe_status(struct dm_target *ti, 238static int stripe_status(struct dm_target *ti,
194 status_type_t type, char *result, unsigned int maxlen) 239 status_type_t type, char *result, unsigned int maxlen)
195{ 240{
196 struct stripe_c *sc = (struct stripe_c *) ti->private; 241 struct stripe_c *sc = (struct stripe_c *) ti->private;
242 char buffer[sc->stripes + 1];
197 unsigned int sz = 0; 243 unsigned int sz = 0;
198 unsigned int i; 244 unsigned int i;
199 245
200 switch (type) { 246 switch (type) {
201 case STATUSTYPE_INFO: 247 case STATUSTYPE_INFO:
202 result[0] = '\0'; 248 DMEMIT("%d ", sc->stripes);
249 for (i = 0; i < sc->stripes; i++) {
250 DMEMIT("%s ", sc->stripe[i].dev->name);
251 buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ?
252 'D' : 'A';
253 }
254 buffer[i] = '\0';
255 DMEMIT("1 %s", buffer);
203 break; 256 break;
204 257
205 case STATUSTYPE_TABLE: 258 case STATUSTYPE_TABLE:
@@ -213,13 +266,52 @@ static int stripe_status(struct dm_target *ti,
213 return 0; 266 return 0;
214} 267}
215 268
269static int stripe_end_io(struct dm_target *ti, struct bio *bio,
270 int error, union map_info *map_context)
271{
272 unsigned i;
273 char major_minor[16];
274 struct stripe_c *sc = ti->private;
275
276 if (!error)
277 return 0; /* I/O complete */
278
279 if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
280 return error;
281
282 if (error == -EOPNOTSUPP)
283 return error;
284
285 memset(major_minor, 0, sizeof(major_minor));
286 sprintf(major_minor, "%d:%d",
287 bio->bi_bdev->bd_disk->major,
288 bio->bi_bdev->bd_disk->first_minor);
289
290 /*
291 * Test to see which stripe drive triggered the event
292 * and increment error count for all stripes on that device.
293 * If the error count for a given device exceeds the threshold
294 * value we will no longer trigger any further events.
295 */
296 for (i = 0; i < sc->stripes; i++)
297 if (!strcmp(sc->stripe[i].dev->name, major_minor)) {
298 atomic_inc(&(sc->stripe[i].error_count));
299 if (atomic_read(&(sc->stripe[i].error_count)) <
300 DM_IO_ERROR_THRESHOLD)
301 queue_work(kstriped, &sc->kstriped_ws);
302 }
303
304 return error;
305}
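
The effect of the threshold is that a persistently failing device raises a bounded number of events rather than one per failed bio. A userspace model of just that counting logic (the device matching and the workqueue are elided):

    #include <stdio.h>

    #define DM_IO_ERROR_THRESHOLD 15

    static int error_count;         /* per-stripe-device counter */
    static int events_queued;

    static void stripe_io_failed(void)
    {
            error_count++;
            if (error_count < DM_IO_ERROR_THRESHOLD)
                    events_queued++;        /* queue_work(kstriped, ...) */
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100; i++)
                    stripe_io_failed();
            printf("%d failures -> %d events\n", error_count, events_queued);
            /* prints: 100 failures -> 14 events */
            return 0;
    }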
306
216static struct target_type stripe_target = { 307static struct target_type stripe_target = {
217 .name = "striped", 308 .name = "striped",
218 .version= {1, 0, 2}, 309 .version = {1, 1, 0},
219 .module = THIS_MODULE, 310 .module = THIS_MODULE,
220 .ctr = stripe_ctr, 311 .ctr = stripe_ctr,
221 .dtr = stripe_dtr, 312 .dtr = stripe_dtr,
222 .map = stripe_map, 313 .map = stripe_map,
314 .end_io = stripe_end_io,
223 .status = stripe_status, 315 .status = stripe_status,
224}; 316};
225 317
@@ -231,6 +323,13 @@ int __init dm_stripe_init(void)
231 if (r < 0) 323 if (r < 0)
232 DMWARN("target registration failed"); 324 DMWARN("target registration failed");
233 325
326 kstriped = create_singlethread_workqueue("kstriped");
327 if (!kstriped) {
328 DMERR("failed to create workqueue kstriped");
329 dm_unregister_target(&stripe_target);
330 return -ENOMEM;
331 }
332
234 return r; 333 return r;
235} 334}
236 335
@@ -239,5 +338,7 @@ void dm_stripe_exit(void)
239 if (dm_unregister_target(&stripe_target)) 338 if (dm_unregister_target(&stripe_target))
240 DMWARN("target unregistration failed"); 339 DMWARN("target unregistration failed");
241 340
341 destroy_workqueue(kstriped);
342
242 return; 343 return;
243} 344}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 47818d8249cb..f16062982383 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices)
287{ 287{
288 struct list_head *tmp, *next; 288 struct list_head *tmp, *next;
289 289
290 for (tmp = devices->next; tmp != devices; tmp = next) { 290 list_for_each_safe(tmp, next, devices) {
291 struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); 291 struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
292 next = tmp->next;
293 kfree(dd); 292 kfree(dd);
294 } 293 }
295} 294}
@@ -476,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
476 int mode, struct dm_dev **result) 475 int mode, struct dm_dev **result)
477{ 476{
478 int r; 477 int r;
479 dev_t dev; 478 dev_t uninitialized_var(dev);
480 struct dm_dev *dd; 479 struct dm_dev *dd;
481 unsigned int major, minor; 480 unsigned int major, minor;
482 481
@@ -805,7 +804,7 @@ static int setup_indexes(struct dm_table *t)
805 return -ENOMEM; 804 return -ENOMEM;
806 805
807 /* set up internal nodes, bottom-up */ 806 /* set up internal nodes, bottom-up */
808 for (i = t->depth - 2, total = 0; i >= 0; i--) { 807 for (i = t->depth - 2; i >= 0; i--) {
809 t->index[i] = indexes; 808 t->index[i] = indexes;
810 indexes += (KEYS_PER_NODE * t->counts[i]); 809 indexes += (KEYS_PER_NODE * t->counts[i]);
811 setup_btree_index(i, t); 810 setup_btree_index(i, t);
@@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t)
993 992
994int dm_table_any_congested(struct dm_table *t, int bdi_bits) 993int dm_table_any_congested(struct dm_table *t, int bdi_bits)
995{ 994{
996 struct list_head *d, *devices; 995 struct dm_dev *dd;
996 struct list_head *devices = dm_table_get_devices(t);
997 int r = 0; 997 int r = 0;
998 998
999 devices = dm_table_get_devices(t); 999 list_for_each_entry(dd, devices, list) {
1000 for (d = devices->next; d != devices; d = d->next) {
1001 struct dm_dev *dd = list_entry(d, struct dm_dev, list);
1002 struct request_queue *q = bdev_get_queue(dd->bdev); 1000 struct request_queue *q = bdev_get_queue(dd->bdev);
1003 r |= bdi_congested(&q->backing_dev_info, bdi_bits); 1001 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1004 } 1002 }
@@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1008 1006
1009void dm_table_unplug_all(struct dm_table *t) 1007void dm_table_unplug_all(struct dm_table *t)
1010{ 1008{
1011 struct list_head *d, *devices = dm_table_get_devices(t); 1009 struct dm_dev *dd;
1010 struct list_head *devices = dm_table_get_devices(t);
1012 1011
1013 for (d = devices->next; d != devices; d = d->next) { 1012 list_for_each_entry(dd, devices, list) {
1014 struct dm_dev *dd = list_entry(d, struct dm_dev, list);
1015 struct request_queue *q = bdev_get_queue(dd->bdev); 1013 struct request_queue *q = bdev_get_queue(dd->bdev);
1016 1014
1017 blk_unplug(q); 1015 blk_unplug(q);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f2d24eb3208c..6617ce4af095 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -71,9 +71,22 @@ union map_info *dm_get_mapinfo(struct bio *bio)
71#define DMF_DELETING 4 71#define DMF_DELETING 4
72#define DMF_NOFLUSH_SUSPENDING 5 72#define DMF_NOFLUSH_SUSPENDING 5
73 73
74/*
75 * Work processed by per-device workqueue.
76 */
77struct dm_wq_req {
78 enum {
79 DM_WQ_FLUSH_ALL,
80 DM_WQ_FLUSH_DEFERRED,
81 } type;
82 struct work_struct work;
83 struct mapped_device *md;
84 void *context;
85};
86
74struct mapped_device { 87struct mapped_device {
75 struct rw_semaphore io_lock; 88 struct rw_semaphore io_lock;
76 struct semaphore suspend_lock; 89 struct mutex suspend_lock;
77 spinlock_t pushback_lock; 90 spinlock_t pushback_lock;
78 rwlock_t map_lock; 91 rwlock_t map_lock;
79 atomic_t holders; 92 atomic_t holders;
@@ -96,6 +109,11 @@ struct mapped_device {
96 struct bio_list pushback; 109 struct bio_list pushback;
97 110
98 /* 111 /*
112 * Processing queue (flush/barriers)
113 */
114 struct workqueue_struct *wq;
115
116 /*
99 * The current mapping. 117 * The current mapping.
100 */ 118 */
101 struct dm_table *map; 119 struct dm_table *map;
@@ -181,7 +199,7 @@ static void local_exit(void)
181 DMINFO("cleaned up"); 199 DMINFO("cleaned up");
182} 200}
183 201
184int (*_inits[])(void) __initdata = { 202static int (*_inits[])(void) __initdata = {
185 local_init, 203 local_init,
186 dm_target_init, 204 dm_target_init,
187 dm_linear_init, 205 dm_linear_init,
@@ -189,7 +207,7 @@ int (*_inits[])(void) __initdata = {
189 dm_interface_init, 207 dm_interface_init,
190}; 208};
191 209
192void (*_exits[])(void) = { 210static void (*_exits[])(void) = {
193 local_exit, 211 local_exit,
194 dm_target_exit, 212 dm_target_exit,
195 dm_linear_exit, 213 dm_linear_exit,
@@ -982,7 +1000,7 @@ static struct mapped_device *alloc_dev(int minor)
982 } 1000 }
983 1001
984 if (!try_module_get(THIS_MODULE)) 1002 if (!try_module_get(THIS_MODULE))
985 goto bad0; 1003 goto bad_module_get;
986 1004
987 /* get a minor number for the dev */ 1005 /* get a minor number for the dev */
988 if (minor == DM_ANY_MINOR) 1006 if (minor == DM_ANY_MINOR)
@@ -990,11 +1008,11 @@ static struct mapped_device *alloc_dev(int minor)
990 else 1008 else
991 r = specific_minor(md, minor); 1009 r = specific_minor(md, minor);
992 if (r < 0) 1010 if (r < 0)
993 goto bad1; 1011 goto bad_minor;
994 1012
995 memset(md, 0, sizeof(*md)); 1013 memset(md, 0, sizeof(*md));
996 init_rwsem(&md->io_lock); 1014 init_rwsem(&md->io_lock);
997 init_MUTEX(&md->suspend_lock); 1015 mutex_init(&md->suspend_lock);
998 spin_lock_init(&md->pushback_lock); 1016 spin_lock_init(&md->pushback_lock);
999 rwlock_init(&md->map_lock); 1017 rwlock_init(&md->map_lock);
1000 atomic_set(&md->holders, 1); 1018 atomic_set(&md->holders, 1);
@@ -1006,7 +1024,7 @@ static struct mapped_device *alloc_dev(int minor)
1006 1024
1007 md->queue = blk_alloc_queue(GFP_KERNEL); 1025 md->queue = blk_alloc_queue(GFP_KERNEL);
1008 if (!md->queue) 1026 if (!md->queue)
1009 goto bad1_free_minor; 1027 goto bad_queue;
1010 1028
1011 md->queue->queuedata = md; 1029 md->queue->queuedata = md;
1012 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1030 md->queue->backing_dev_info.congested_fn = dm_any_congested;
@@ -1017,11 +1035,11 @@ static struct mapped_device *alloc_dev(int minor)
1017 1035
1018 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1036 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
1019 if (!md->io_pool) 1037 if (!md->io_pool)
1020 goto bad2; 1038 goto bad_io_pool;
1021 1039
1022 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); 1040 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1023 if (!md->tio_pool) 1041 if (!md->tio_pool)
1024 goto bad3; 1042 goto bad_tio_pool;
1025 1043
1026 md->bs = bioset_create(16, 16); 1044 md->bs = bioset_create(16, 16);
1027 if (!md->bs) 1045 if (!md->bs)
@@ -1029,7 +1047,7 @@ static struct mapped_device *alloc_dev(int minor)
1029 1047
1030 md->disk = alloc_disk(1); 1048 md->disk = alloc_disk(1);
1031 if (!md->disk) 1049 if (!md->disk)
1032 goto bad4; 1050 goto bad_disk;
1033 1051
1034 atomic_set(&md->pending, 0); 1052 atomic_set(&md->pending, 0);
1035 init_waitqueue_head(&md->wait); 1053 init_waitqueue_head(&md->wait);
@@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor)
1044 add_disk(md->disk); 1062 add_disk(md->disk);
1045 format_dev_t(md->name, MKDEV(_major, minor)); 1063 format_dev_t(md->name, MKDEV(_major, minor));
1046 1064
1065 md->wq = create_singlethread_workqueue("kdmflush");
1066 if (!md->wq)
1067 goto bad_thread;
1068
1047 /* Populate the mapping, nobody knows we exist yet */ 1069 /* Populate the mapping, nobody knows we exist yet */
1048 spin_lock(&_minor_lock); 1070 spin_lock(&_minor_lock);
1049 old_md = idr_replace(&_minor_idr, md, minor); 1071 old_md = idr_replace(&_minor_idr, md, minor);
@@ -1053,19 +1075,21 @@ static struct mapped_device *alloc_dev(int minor)
1053 1075
1054 return md; 1076 return md;
1055 1077
1056 bad4: 1078bad_thread:
1079 put_disk(md->disk);
1080bad_disk:
1057 bioset_free(md->bs); 1081 bioset_free(md->bs);
1058 bad_no_bioset: 1082bad_no_bioset:
1059 mempool_destroy(md->tio_pool); 1083 mempool_destroy(md->tio_pool);
1060 bad3: 1084bad_tio_pool:
1061 mempool_destroy(md->io_pool); 1085 mempool_destroy(md->io_pool);
1062 bad2: 1086bad_io_pool:
1063 blk_cleanup_queue(md->queue); 1087 blk_cleanup_queue(md->queue);
1064 bad1_free_minor: 1088bad_queue:
1065 free_minor(minor); 1089 free_minor(minor);
1066 bad1: 1090bad_minor:
1067 module_put(THIS_MODULE); 1091 module_put(THIS_MODULE);
1068 bad0: 1092bad_module_get:
1069 kfree(md); 1093 kfree(md);
1070 return NULL; 1094 return NULL;
1071} 1095}
@@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md)
1080 unlock_fs(md); 1104 unlock_fs(md);
1081 bdput(md->suspended_bdev); 1105 bdput(md->suspended_bdev);
1082 } 1106 }
1107 destroy_workqueue(md->wq);
1083 mempool_destroy(md->tio_pool); 1108 mempool_destroy(md->tio_pool);
1084 mempool_destroy(md->io_pool); 1109 mempool_destroy(md->io_pool);
1085 bioset_free(md->bs); 1110 bioset_free(md->bs);
@@ -1259,20 +1284,91 @@ void dm_put(struct mapped_device *md)
1259} 1284}
1260EXPORT_SYMBOL_GPL(dm_put); 1285EXPORT_SYMBOL_GPL(dm_put);
1261 1286
1287static int dm_wait_for_completion(struct mapped_device *md)
1288{
1289 int r = 0;
1290
1291 while (1) {
1292 set_current_state(TASK_INTERRUPTIBLE);
1293
1294 smp_mb();
1295 if (!atomic_read(&md->pending))
1296 break;
1297
1298 if (signal_pending(current)) {
1299 r = -EINTR;
1300 break;
1301 }
1302
1303 io_schedule();
1304 }
1305 set_current_state(TASK_RUNNING);
1306
1307 return r;
1308}
1309
1262/* 1310/*
1263 * Process the deferred bios 1311 * Process the deferred bios
1264 */ 1312 */
1265static void __flush_deferred_io(struct mapped_device *md, struct bio *c) 1313static void __flush_deferred_io(struct mapped_device *md)
1266{ 1314{
1267 struct bio *n; 1315 struct bio *c;
1268 1316
1269 while (c) { 1317 while ((c = bio_list_pop(&md->deferred))) {
1270 n = c->bi_next;
1271 c->bi_next = NULL;
1272 if (__split_bio(md, c)) 1318 if (__split_bio(md, c))
1273 bio_io_error(c); 1319 bio_io_error(c);
1274 c = n;
1275 } 1320 }
1321
1322 clear_bit(DMF_BLOCK_IO, &md->flags);
1323}
1324
1325static void __merge_pushback_list(struct mapped_device *md)
1326{
1327 unsigned long flags;
1328
1329 spin_lock_irqsave(&md->pushback_lock, flags);
1330 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1331 bio_list_merge_head(&md->deferred, &md->pushback);
1332 bio_list_init(&md->pushback);
1333 spin_unlock_irqrestore(&md->pushback_lock, flags);
1334}
1335
1336static void dm_wq_work(struct work_struct *work)
1337{
1338 struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
1339 struct mapped_device *md = req->md;
1340
1341 down_write(&md->io_lock);
1342 switch (req->type) {
1343 case DM_WQ_FLUSH_ALL:
1344 __merge_pushback_list(md);
1345 /* pass through */
1346 case DM_WQ_FLUSH_DEFERRED:
1347 __flush_deferred_io(md);
1348 break;
1349 default:
1350 DMERR("dm_wq_work: unrecognised work type %d", req->type);
1351 BUG();
1352 }
1353 up_write(&md->io_lock);
1354}
1355
1356static void dm_wq_queue(struct mapped_device *md, int type, void *context,
1357 struct dm_wq_req *req)
1358{
1359 req->type = type;
1360 req->md = md;
1361 req->context = context;
1362 INIT_WORK(&req->work, dm_wq_work);
1363 queue_work(md->wq, &req->work);
1364}
1365
1366static void dm_queue_flush(struct mapped_device *md, int type, void *context)
1367{
1368 struct dm_wq_req req;
1369
1370 dm_wq_queue(md, type, context, &req);
1371 flush_workqueue(md->wq);
1276} 1372}
1277 1373
1278/* 1374/*
@@ -1282,7 +1378,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1282{ 1378{
1283 int r = -EINVAL; 1379 int r = -EINVAL;
1284 1380
1285 down(&md->suspend_lock); 1381 mutex_lock(&md->suspend_lock);
1286 1382
1287 /* device must be suspended */ 1383 /* device must be suspended */
1288 if (!dm_suspended(md)) 1384 if (!dm_suspended(md))
@@ -1297,7 +1393,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1297 r = __bind(md, table); 1393 r = __bind(md, table);
1298 1394
1299out: 1395out:
1300 up(&md->suspend_lock); 1396 mutex_unlock(&md->suspend_lock);
1301 return r; 1397 return r;
1302} 1398}
1303 1399
@@ -1346,17 +1442,17 @@ static void unlock_fs(struct mapped_device *md)
1346int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 1442int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1347{ 1443{
1348 struct dm_table *map = NULL; 1444 struct dm_table *map = NULL;
1349 unsigned long flags;
1350 DECLARE_WAITQUEUE(wait, current); 1445 DECLARE_WAITQUEUE(wait, current);
1351 struct bio *def; 1446 int r = 0;
1352 int r = -EINVAL;
1353 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 1447 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
1354 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 1448 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
1355 1449
1356 down(&md->suspend_lock); 1450 mutex_lock(&md->suspend_lock);
1357 1451
1358 if (dm_suspended(md)) 1452 if (dm_suspended(md)) {
1453 r = -EINVAL;
1359 goto out_unlock; 1454 goto out_unlock;
1455 }
1360 1456
1361 map = dm_get_table(md); 1457 map = dm_get_table(md);
1362 1458
@@ -1378,16 +1474,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1378 r = -ENOMEM; 1474 r = -ENOMEM;
1379 goto flush_and_out; 1475 goto flush_and_out;
1380 } 1476 }
1381 }
1382 1477
1383 /* 1478 /*
1384 * Flush I/O to the device. 1479 * Flush I/O to the device. noflush supersedes do_lockfs,
1385 * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. 1480 * because lock_fs() needs to flush I/Os.
1386 */ 1481 */
1387 if (do_lockfs && !noflush) { 1482 if (do_lockfs) {
1388 r = lock_fs(md); 1483 r = lock_fs(md);
1389 if (r) 1484 if (r)
1390 goto out; 1485 goto out;
1486 }
1391 } 1487 }
1392 1488
1393 /* 1489 /*
@@ -1404,66 +1500,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1404 dm_table_unplug_all(map); 1500 dm_table_unplug_all(map);
1405 1501
1406 /* 1502 /*
1407 * Then we wait for the already mapped ios to 1503 * Wait for the already-mapped ios to complete.
1408 * complete.
1409 */ 1504 */
1410 while (1) { 1505 r = dm_wait_for_completion(md);
1411 set_current_state(TASK_INTERRUPTIBLE);
1412
1413 if (!atomic_read(&md->pending) || signal_pending(current))
1414 break;
1415
1416 io_schedule();
1417 }
1418 set_current_state(TASK_RUNNING);
1419 1506
1420 down_write(&md->io_lock); 1507 down_write(&md->io_lock);
1421 remove_wait_queue(&md->wait, &wait); 1508 remove_wait_queue(&md->wait, &wait);
1422 1509
1423 if (noflush) { 1510 if (noflush)
1424 spin_lock_irqsave(&md->pushback_lock, flags); 1511 __merge_pushback_list(md);
1425 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1512 up_write(&md->io_lock);
1426 bio_list_merge_head(&md->deferred, &md->pushback);
1427 bio_list_init(&md->pushback);
1428 spin_unlock_irqrestore(&md->pushback_lock, flags);
1429 }
1430 1513
1431 /* were we interrupted ? */ 1514 /* were we interrupted ? */
1432 r = -EINTR; 1515 if (r < 0) {
1433 if (atomic_read(&md->pending)) { 1516 dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
1434 clear_bit(DMF_BLOCK_IO, &md->flags); 1517
1435 def = bio_list_get(&md->deferred);
1436 __flush_deferred_io(md, def);
1437 up_write(&md->io_lock);
1438 unlock_fs(md); 1518 unlock_fs(md);
1439 goto out; /* pushback list is already flushed, so skip flush */ 1519 goto out; /* pushback list is already flushed, so skip flush */
1440 } 1520 }
1441 up_write(&md->io_lock);
1442 1521
1443 dm_table_postsuspend_targets(map); 1522 dm_table_postsuspend_targets(map);
1444 1523
1445 set_bit(DMF_SUSPENDED, &md->flags); 1524 set_bit(DMF_SUSPENDED, &md->flags);
1446 1525
1447 r = 0;
1448
1449flush_and_out: 1526flush_and_out:
1450 if (r && noflush) { 1527 if (r && noflush)
1451 /* 1528 /*
1452 * Because there may be already I/Os in the pushback list, 1529 * Because there may be already I/Os in the pushback list,
1453 * flush them before return. 1530 * flush them before return.
1454 */ 1531 */
1455 down_write(&md->io_lock); 1532 dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
1456
1457 spin_lock_irqsave(&md->pushback_lock, flags);
1458 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1459 bio_list_merge_head(&md->deferred, &md->pushback);
1460 bio_list_init(&md->pushback);
1461 spin_unlock_irqrestore(&md->pushback_lock, flags);
1462
1463 def = bio_list_get(&md->deferred);
1464 __flush_deferred_io(md, def);
1465 up_write(&md->io_lock);
1466 }
1467 1533
1468out: 1534out:
1469 if (r && md->suspended_bdev) { 1535 if (r && md->suspended_bdev) {
@@ -1474,17 +1540,16 @@ out:
1474 dm_table_put(map); 1540 dm_table_put(map);
1475 1541
1476out_unlock: 1542out_unlock:
1477 up(&md->suspend_lock); 1543 mutex_unlock(&md->suspend_lock);
1478 return r; 1544 return r;
1479} 1545}
1480 1546
1481int dm_resume(struct mapped_device *md) 1547int dm_resume(struct mapped_device *md)
1482{ 1548{
1483 int r = -EINVAL; 1549 int r = -EINVAL;
1484 struct bio *def;
1485 struct dm_table *map = NULL; 1550 struct dm_table *map = NULL;
1486 1551
1487 down(&md->suspend_lock); 1552 mutex_lock(&md->suspend_lock);
1488 if (!dm_suspended(md)) 1553 if (!dm_suspended(md))
1489 goto out; 1554 goto out;
1490 1555
@@ -1496,12 +1561,7 @@ int dm_resume(struct mapped_device *md)
1496 if (r) 1561 if (r)
1497 goto out; 1562 goto out;
1498 1563
1499 down_write(&md->io_lock); 1564 dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
1500 clear_bit(DMF_BLOCK_IO, &md->flags);
1501
1502 def = bio_list_get(&md->deferred);
1503 __flush_deferred_io(md, def);
1504 up_write(&md->io_lock);
1505 1565
1506 unlock_fs(md); 1566 unlock_fs(md);
1507 1567
@@ -1520,7 +1580,7 @@ int dm_resume(struct mapped_device *md)
1520 1580
1521out: 1581out:
1522 dm_table_put(map); 1582 dm_table_put(map);
1523 up(&md->suspend_lock); 1583 mutex_unlock(&md->suspend_lock);
1524 1584
1525 return r; 1585 return r;
1526} 1586}