Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Makefile                 2
-rw-r--r--  drivers/md/dm-crypt.c              56
-rw-r--r--  drivers/md/dm-delay.c               3
-rw-r--r--  drivers/md/dm-exception-store.c   108
-rw-r--r--  drivers/md/dm-io.c                  2
-rw-r--r--  drivers/md/dm-kcopyd.c             14
-rw-r--r--  drivers/md/dm-linear.c              2
-rw-r--r--  drivers/md/dm-log.c                 2
-rw-r--r--  drivers/md/dm-mpath.c               3
-rw-r--r--  drivers/md/dm-path-selector.c       3
-rw-r--r--  drivers/md/dm-raid1.c             791
-rw-r--r--  drivers/md/dm-region-hash.c       704
-rw-r--r--  drivers/md/dm-round-robin.c         3
-rw-r--r--  drivers/md/dm-snap.c               11
-rw-r--r--  drivers/md/dm-snap.h                5
-rw-r--r--  drivers/md/dm-stripe.c              6
-rw-r--r--  drivers/md/dm-zero.c                2
-rw-r--r--  drivers/md/dm.c                    49
-rw-r--r--  drivers/md/dm.h                     9
19 files changed, 977 insertions, 798 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f1ef33dfd8cf..1c615804ea76 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
34obj-$(CONFIG_DM_DELAY) += dm-delay.o 34obj-$(CONFIG_DM_DELAY) += dm-delay.o
35obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 35obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
36obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 36obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
37obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o 37obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39 39
40quiet_cmd_unroll = UNROLL $@ 40quiet_cmd_unroll = UNROLL $@
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 682ef9e6acd3..ce26c84af064 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -23,7 +23,7 @@
23#include <asm/page.h> 23#include <asm/page.h>
24#include <asm/unaligned.h> 24#include <asm/unaligned.h>
25 25
26#include "dm.h" 26#include <linux/device-mapper.h>
27 27
28#define DM_MSG_PREFIX "crypt" 28#define DM_MSG_PREFIX "crypt"
29#define MESG_STR(x) x, sizeof(x) 29#define MESG_STR(x) x, sizeof(x)
@@ -56,6 +56,7 @@ struct dm_crypt_io {
56 atomic_t pending; 56 atomic_t pending;
57 int error; 57 int error;
58 sector_t sector; 58 sector_t sector;
59 struct dm_crypt_io *base_io;
59}; 60};
60 61
61struct dm_crypt_request { 62struct dm_crypt_request {
@@ -93,7 +94,6 @@ struct crypt_config {
93 94
94 struct workqueue_struct *io_queue; 95 struct workqueue_struct *io_queue;
95 struct workqueue_struct *crypt_queue; 96 struct workqueue_struct *crypt_queue;
96 wait_queue_head_t writeq;
97 97
98 /* 98 /*
99 * crypto related data 99 * crypto related data
@@ -534,6 +534,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
534 io->base_bio = bio; 534 io->base_bio = bio;
535 io->sector = sector; 535 io->sector = sector;
536 io->error = 0; 536 io->error = 0;
537 io->base_io = NULL;
537 atomic_set(&io->pending, 0); 538 atomic_set(&io->pending, 0);
538 539
539 return io; 540 return io;
@@ -547,6 +548,7 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
547/* 548/*
548 * One of the bios was finished. Check for completion of 549 * One of the bios was finished. Check for completion of
549 * the whole request and correctly clean up the buffer. 550 * the whole request and correctly clean up the buffer.
551 * If base_io is set, wait for the last fragment to complete.
550 */ 552 */
551static void crypt_dec_pending(struct dm_crypt_io *io) 553static void crypt_dec_pending(struct dm_crypt_io *io)
552{ 554{
@@ -555,7 +557,14 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
555 if (!atomic_dec_and_test(&io->pending)) 557 if (!atomic_dec_and_test(&io->pending))
556 return; 558 return;
557 559
558 bio_endio(io->base_bio, io->error); 560 if (likely(!io->base_io))
561 bio_endio(io->base_bio, io->error);
562 else {
563 if (io->error && !io->base_io->error)
564 io->base_io->error = io->error;
565 crypt_dec_pending(io->base_io);
566 }
567
559 mempool_free(io, cc->io_pool); 568 mempool_free(io, cc->io_pool);
560} 569}
561 570
@@ -646,10 +655,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
646static void kcryptd_io_write(struct dm_crypt_io *io) 655static void kcryptd_io_write(struct dm_crypt_io *io)
647{ 656{
648 struct bio *clone = io->ctx.bio_out; 657 struct bio *clone = io->ctx.bio_out;
649 struct crypt_config *cc = io->target->private;
650
651 generic_make_request(clone); 658 generic_make_request(clone);
652 wake_up(&cc->writeq);
653} 659}
654 660
655static void kcryptd_io(struct work_struct *work) 661static void kcryptd_io(struct work_struct *work)
@@ -688,7 +694,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
688 BUG_ON(io->ctx.idx_out < clone->bi_vcnt); 694 BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
689 695
690 clone->bi_sector = cc->start + io->sector; 696 clone->bi_sector = cc->start + io->sector;
691 io->sector += bio_sectors(clone);
692 697
693 if (async) 698 if (async)
694 kcryptd_queue_io(io); 699 kcryptd_queue_io(io);
@@ -700,16 +705,18 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
700{ 705{
701 struct crypt_config *cc = io->target->private; 706 struct crypt_config *cc = io->target->private;
702 struct bio *clone; 707 struct bio *clone;
708 struct dm_crypt_io *new_io;
703 int crypt_finished; 709 int crypt_finished;
704 unsigned out_of_pages = 0; 710 unsigned out_of_pages = 0;
705 unsigned remaining = io->base_bio->bi_size; 711 unsigned remaining = io->base_bio->bi_size;
712 sector_t sector = io->sector;
706 int r; 713 int r;
707 714
708 /* 715 /*
709 * Prevent io from disappearing until this function completes. 716 * Prevent io from disappearing until this function completes.
710 */ 717 */
711 crypt_inc_pending(io); 718 crypt_inc_pending(io);
712 crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); 719 crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
713 720
714 /* 721 /*
715 * The allocated buffers can be smaller than the whole bio, 722 * The allocated buffers can be smaller than the whole bio,
@@ -726,6 +733,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
726 io->ctx.idx_out = 0; 733 io->ctx.idx_out = 0;
727 734
728 remaining -= clone->bi_size; 735 remaining -= clone->bi_size;
736 sector += bio_sectors(clone);
729 737
730 crypt_inc_pending(io); 738 crypt_inc_pending(io);
731 r = crypt_convert(cc, &io->ctx); 739 r = crypt_convert(cc, &io->ctx);
@@ -741,6 +749,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
741 */ 749 */
742 if (unlikely(r < 0)) 750 if (unlikely(r < 0))
743 break; 751 break;
752
753 io->sector = sector;
744 } 754 }
745 755
746 /* 756 /*
@@ -750,8 +760,33 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
750 if (unlikely(out_of_pages)) 760 if (unlikely(out_of_pages))
751 congestion_wait(WRITE, HZ/100); 761 congestion_wait(WRITE, HZ/100);
752 762
753 if (unlikely(remaining)) 763 /*
754 wait_event(cc->writeq, !atomic_read(&io->ctx.pending)); 764 * With async crypto it is unsafe to share the crypto context
765 * between fragments, so switch to a new dm_crypt_io structure.
766 */
767 if (unlikely(!crypt_finished && remaining)) {
768 new_io = crypt_io_alloc(io->target, io->base_bio,
769 sector);
770 crypt_inc_pending(new_io);
771 crypt_convert_init(cc, &new_io->ctx, NULL,
772 io->base_bio, sector);
773 new_io->ctx.idx_in = io->ctx.idx_in;
774 new_io->ctx.offset_in = io->ctx.offset_in;
775
776 /*
777 * Fragments after the first use the base_io
778 * pending count.
779 */
780 if (!io->base_io)
781 new_io->base_io = io;
782 else {
783 new_io->base_io = io->base_io;
784 crypt_inc_pending(io->base_io);
785 crypt_dec_pending(io);
786 }
787
788 io = new_io;
789 }
755 } 790 }
756 791
757 crypt_dec_pending(io); 792 crypt_dec_pending(io);
@@ -1078,7 +1113,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1078 goto bad_crypt_queue; 1113 goto bad_crypt_queue;
1079 } 1114 }
1080 1115
1081 init_waitqueue_head(&cc->writeq);
1082 ti->private = cc; 1116 ti->private = cc;
1083 return 0; 1117 return 0;
1084 1118
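[Annotation] The dm-crypt hunks above replace the cc->writeq wait with explicit chaining of write fragments through io->base_io: each fragment allocated after the first takes a reference on the base io's pending count, records its error there, and only the base io ends the original bio once the last fragment drops its reference. The userspace sketch below illustrates that reference-counting pattern with invented names (frag_io, dec_pending, alloc_fragment) and a plain int in place of atomic_t and the mempool; it is a simplified illustration, not the kernel code.

/*
 * Sketch of the base_io chaining pattern (assumed simplification of the
 * patch above): fragments after the first borrow the base io's pending
 * count, so the original request completes exactly once.
 */
#include <stdio.h>
#include <stdlib.h>

struct frag_io {
        int pending;             /* operations still in flight for this io */
        int error;               /* first error seen, 0 if none */
        struct frag_io *base_io; /* NULL for the first fragment */
};

static void complete_original_request(struct frag_io *io)
{
        printf("original request completed, error=%d\n", io->error);
}

static void dec_pending(struct frag_io *io)
{
        if (--io->pending)
                return;

        if (!io->base_io)
                complete_original_request(io);  /* first fragment owns the bio */
        else {
                if (io->error && !io->base_io->error)
                        io->base_io->error = io->error;  /* propagate first error */
                dec_pending(io->base_io);                /* drop borrowed reference */
        }
        free(io);
}

static struct frag_io *alloc_fragment(struct frag_io *base)
{
        struct frag_io *io = calloc(1, sizeof(*io));

        io->pending = 1;
        if (base) {
                io->base_io = base;
                base->pending++;  /* fragment holds a reference on the base */
        }
        return io;
}

int main(void)
{
        struct frag_io *base = alloc_fragment(NULL);
        struct frag_io *frag = alloc_fragment(base);

        frag->error = -5;     /* pretend this fragment failed */
        dec_pending(frag);    /* last ref on frag: error propagates to base */
        dec_pending(base);    /* base's own ref: original request completes */
        return 0;
}

Run as-is, the error is reported exactly once, from the base io, which is the property the patch relies on to end base_bio a single time.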
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index bdd37f881c42..848b381f1173 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -13,7 +13,8 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15 15
16#include "dm.h" 16#include <linux/device-mapper.h>
17
17#include "dm-bio-list.h" 18#include "dm-bio-list.h"
18 19
19#define DM_MSG_PREFIX "delay" 20#define DM_MSG_PREFIX "delay"
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 769ab677f8e0..01590f3e0009 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -7,7 +7,6 @@
7 * This file is released under the GPL. 7 * This file is released under the GPL.
8 */ 8 */
9 9
10#include "dm.h"
11#include "dm-snap.h" 10#include "dm-snap.h"
12 11
13#include <linux/mm.h> 12#include <linux/mm.h>
@@ -105,6 +104,11 @@ struct pstore {
105 void *area; 104 void *area;
106 105
107 /* 106 /*
107 * An area of zeros used to clear the next area.
108 */
109 void *zero_area;
110
111 /*
108 * Used to keep track of which metadata area the data in 112 * Used to keep track of which metadata area the data in
109 * 'chunk' refers to. 113 * 'chunk' refers to.
110 */ 114 */
@@ -149,6 +153,13 @@ static int alloc_area(struct pstore *ps)
149 if (!ps->area) 153 if (!ps->area)
150 return r; 154 return r;
151 155
156 ps->zero_area = vmalloc(len);
157 if (!ps->zero_area) {
158 vfree(ps->area);
159 return r;
160 }
161 memset(ps->zero_area, 0, len);
162
152 return 0; 163 return 0;
153} 164}
154 165
@@ -156,6 +167,8 @@ static void free_area(struct pstore *ps)
156{ 167{
157 vfree(ps->area); 168 vfree(ps->area);
158 ps->area = NULL; 169 ps->area = NULL;
170 vfree(ps->zero_area);
171 ps->zero_area = NULL;
159} 172}
160 173
161struct mdata_req { 174struct mdata_req {
@@ -220,25 +233,41 @@ static chunk_t area_location(struct pstore *ps, chunk_t area)
220 * Read or write a metadata area. Remembering to skip the first 233 * Read or write a metadata area. Remembering to skip the first
221 * chunk which holds the header. 234 * chunk which holds the header.
222 */ 235 */
223static int area_io(struct pstore *ps, chunk_t area, int rw) 236static int area_io(struct pstore *ps, int rw)
224{ 237{
225 int r; 238 int r;
226 chunk_t chunk; 239 chunk_t chunk;
227 240
228 chunk = area_location(ps, area); 241 chunk = area_location(ps, ps->current_area);
229 242
230 r = chunk_io(ps, chunk, rw, 0); 243 r = chunk_io(ps, chunk, rw, 0);
231 if (r) 244 if (r)
232 return r; 245 return r;
233 246
234 ps->current_area = area;
235 return 0; 247 return 0;
236} 248}
237 249
238static int zero_area(struct pstore *ps, chunk_t area) 250static void zero_memory_area(struct pstore *ps)
239{ 251{
240 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); 252 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
241 return area_io(ps, area, WRITE); 253}
254
255static int zero_disk_area(struct pstore *ps, chunk_t area)
256{
257 struct dm_io_region where = {
258 .bdev = ps->snap->cow->bdev,
259 .sector = ps->snap->chunk_size * area_location(ps, area),
260 .count = ps->snap->chunk_size,
261 };
262 struct dm_io_request io_req = {
263 .bi_rw = WRITE,
264 .mem.type = DM_IO_VMA,
265 .mem.ptr.vma = ps->zero_area,
266 .client = ps->io_client,
267 .notify.fn = NULL,
268 };
269
270 return dm_io(&io_req, 1, &where, NULL);
242} 271}
243 272
244static int read_header(struct pstore *ps, int *new_snapshot) 273static int read_header(struct pstore *ps, int *new_snapshot)
@@ -411,15 +440,14 @@ static int insert_exceptions(struct pstore *ps, int *full)
411 440
412static int read_exceptions(struct pstore *ps) 441static int read_exceptions(struct pstore *ps)
413{ 442{
414 chunk_t area;
415 int r, full = 1; 443 int r, full = 1;
416 444
417 /* 445 /*
418 * Keeping reading chunks and inserting exceptions until 446 * Keeping reading chunks and inserting exceptions until
419 * we find a partially full area. 447 * we find a partially full area.
420 */ 448 */
421 for (area = 0; full; area++) { 449 for (ps->current_area = 0; full; ps->current_area++) {
422 r = area_io(ps, area, READ); 450 r = area_io(ps, READ);
423 if (r) 451 if (r)
424 return r; 452 return r;
425 453
@@ -428,6 +456,8 @@ static int read_exceptions(struct pstore *ps)
428 return r; 456 return r;
429 } 457 }
430 458
459 ps->current_area--;
460
431 return 0; 461 return 0;
432} 462}
433 463
@@ -486,12 +516,13 @@ static int persistent_read_metadata(struct exception_store *store)
486 return r; 516 return r;
487 } 517 }
488 518
489 r = zero_area(ps, 0); 519 ps->current_area = 0;
520 zero_memory_area(ps);
521 r = zero_disk_area(ps, 0);
490 if (r) { 522 if (r) {
491 DMWARN("zero_area(0) failed"); 523 DMWARN("zero_disk_area(0) failed");
492 return r; 524 return r;
493 } 525 }
494
495 } else { 526 } else {
496 /* 527 /*
497 * Sanity checks. 528 * Sanity checks.
@@ -551,7 +582,6 @@ static void persistent_commit(struct exception_store *store,
551 void (*callback) (void *, int success), 582 void (*callback) (void *, int success),
552 void *callback_context) 583 void *callback_context)
553{ 584{
554 int r;
555 unsigned int i; 585 unsigned int i;
556 struct pstore *ps = get_info(store); 586 struct pstore *ps = get_info(store);
557 struct disk_exception de; 587 struct disk_exception de;
@@ -572,33 +602,41 @@ static void persistent_commit(struct exception_store *store,
572 cb->context = callback_context; 602 cb->context = callback_context;
573 603
574 /* 604 /*
575 * If there are no more exceptions in flight, or we have 605 * If there are exceptions in flight and we have not yet
576 * filled this metadata area we commit the exceptions to 606 * filled this metadata area there's nothing more to do.
577 * disk.
578 */ 607 */
579 if (atomic_dec_and_test(&ps->pending_count) || 608 if (!atomic_dec_and_test(&ps->pending_count) &&
580 (ps->current_committed == ps->exceptions_per_area)) { 609 (ps->current_committed != ps->exceptions_per_area))
581 r = area_io(ps, ps->current_area, WRITE); 610 return;
582 if (r)
583 ps->valid = 0;
584 611
585 /* 612 /*
586 * Have we completely filled the current area ? 613 * If we completely filled the current area, then wipe the next one.
587 */ 614 */
588 if (ps->current_committed == ps->exceptions_per_area) { 615 if ((ps->current_committed == ps->exceptions_per_area) &&
589 ps->current_committed = 0; 616 zero_disk_area(ps, ps->current_area + 1))
590 r = zero_area(ps, ps->current_area + 1); 617 ps->valid = 0;
591 if (r)
592 ps->valid = 0;
593 }
594 618
595 for (i = 0; i < ps->callback_count; i++) { 619 /*
596 cb = ps->callbacks + i; 620 * Commit exceptions to disk.
597 cb->callback(cb->context, r == 0 ? 1 : 0); 621 */
598 } 622 if (ps->valid && area_io(ps, WRITE))
623 ps->valid = 0;
599 624
600 ps->callback_count = 0; 625 /*
626 * Advance to the next area if this one is full.
627 */
628 if (ps->current_committed == ps->exceptions_per_area) {
629 ps->current_committed = 0;
630 ps->current_area++;
631 zero_memory_area(ps);
601 } 632 }
633
634 for (i = 0; i < ps->callback_count; i++) {
635 cb = ps->callbacks + i;
636 cb->callback(cb->context, ps->valid);
637 }
638
639 ps->callback_count = 0;
602} 640}
603 641
604static void persistent_drop(struct exception_store *store) 642static void persistent_drop(struct exception_store *store)
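[Annotation] With these changes persistent_commit() returns early while work is still pending and the area has room, pre-zeroes the next on-disk area once the current one fills, flushes the current area, advances the in-memory area, and only then runs the queued callbacks with the store's validity flag. Below is a rough userspace sketch of that ordering, with invented names and printf() standing in for the dm-io calls; it illustrates the control flow, not the exception-store implementation.

/*
 * Sketch of the reordered commit flow (assumed simplification): collect
 * callbacks per area, zero the next on-disk area when the current one
 * fills, write the current area, then run the callbacks.
 */
#include <stdio.h>
#include <stdbool.h>

#define EXCEPTIONS_PER_AREA 4

struct pstore_sketch {
        int pending_count;      /* exceptions handed out but not committed */
        int current_committed;  /* exceptions committed into current area */
        int current_area;       /* metadata area currently being filled */
        int callback_count;     /* callbacks queued for this flush */
        bool valid;
};

static int zero_disk_area(int area)
{
        printf("zeroing on-disk area %d\n", area);
        return 0;               /* pretend the write succeeded */
}

static int area_io_write(struct pstore_sketch *ps)
{
        printf("writing metadata area %d\n", ps->current_area);
        return 0;
}

static void commit_exception(struct pstore_sketch *ps)
{
        ps->current_committed++;
        ps->callback_count++;

        /* Work still in flight and room left in this area: nothing to do. */
        if (--ps->pending_count &&
            ps->current_committed != EXCEPTIONS_PER_AREA)
                return;

        /* Area just filled: wipe the next on-disk area before moving on. */
        if (ps->current_committed == EXCEPTIONS_PER_AREA &&
            zero_disk_area(ps->current_area + 1))
                ps->valid = false;

        /* Flush the in-memory copy of the current area. */
        if (ps->valid && area_io_write(ps))
                ps->valid = false;

        /* Advance to the next (already zeroed) area if this one is full. */
        if (ps->current_committed == EXCEPTIONS_PER_AREA) {
                ps->current_committed = 0;
                ps->current_area++;
        }

        printf("running %d callbacks, valid=%d\n",
               ps->callback_count, ps->valid);
        ps->callback_count = 0;
}

int main(void)
{
        struct pstore_sketch ps = { .valid = true };

        /* Two exceptions in flight: the first commit only queues its callback. */
        ps.pending_count = 2;
        commit_exception(&ps);
        commit_exception(&ps);

        /* Fill the rest of the area to trigger the zero/advance path. */
        ps.pending_count = 2;
        commit_exception(&ps);
        commit_exception(&ps);
        return 0;
}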
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 4789c42d9a3a..2fd6d4450637 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,7 +5,7 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h" 8#include <linux/device-mapper.h>
9 9
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 996802b8a452..3073618269ea 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -22,6 +22,7 @@
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/workqueue.h> 23#include <linux/workqueue.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/device-mapper.h>
25#include <linux/dm-kcopyd.h> 26#include <linux/dm-kcopyd.h>
26 27
27#include "dm.h" 28#include "dm.h"
@@ -268,6 +269,17 @@ static void push(struct list_head *jobs, struct kcopyd_job *job)
268 spin_unlock_irqrestore(&kc->job_lock, flags); 269 spin_unlock_irqrestore(&kc->job_lock, flags);
269} 270}
270 271
272
273static void push_head(struct list_head *jobs, struct kcopyd_job *job)
274{
275 unsigned long flags;
276 struct dm_kcopyd_client *kc = job->kc;
277
278 spin_lock_irqsave(&kc->job_lock, flags);
279 list_add(&job->list, jobs);
280 spin_unlock_irqrestore(&kc->job_lock, flags);
281}
282
271/* 283/*
272 * These three functions process 1 item from the corresponding 284 * These three functions process 1 item from the corresponding
273 * job list. 285 * job list.
@@ -398,7 +410,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
398 * We couldn't service this job ATM, so 410 * We couldn't service this job ATM, so
399 * push this job back onto the list. 411 * push this job back onto the list.
400 */ 412 */
401 push(jobs, job); 413 push_head(jobs, job);
402 break; 414 break;
403 } 415 }
404 416
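[Annotation] The kcopyd hunk above requeues a job that cannot be serviced yet with the new push_head() (list_add, i.e. at the front of the list) instead of push(), which appends at the tail, so the job keeps its place ahead of work submitted after it. The toy sketch below shows the head-versus-tail requeueing with a hand-rolled singly linked list rather than the kernel list API.

/*
 * Sketch of head vs. tail requeueing (toy list, not <linux/list.h>):
 * putting a busy job back at the head preserves submission order.
 */
#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

/* Insert at the front of the queue, like list_add(). */
static void push_head(struct node **head, struct node *n)
{
        n->next = *head;
        *head = n;
}

/* Insert at the back of the queue, like list_add_tail(). */
static void push_tail(struct node **head, struct node *n)
{
        struct node **p = head;

        while (*p)
                p = &(*p)->next;
        n->next = NULL;
        *p = n;
}

static struct node *pop(struct node **head)
{
        struct node *n = *head;

        if (n)
                *head = n->next;
        return n;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 }, c = { 3 };
        struct node *jobs = NULL;

        push_tail(&jobs, &a);
        push_tail(&jobs, &b);
        push_tail(&jobs, &c);

        /* Job 1 cannot be serviced yet: requeue it at the head... */
        struct node *busy = pop(&jobs);
        push_head(&jobs, busy);

        /* ...so the original submission order 1, 2, 3 is preserved. */
        for (struct node *n = pop(&jobs); n; n = pop(&jobs))
                printf("servicing job %d\n", n->id);

        return 0;
}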
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6449bcdf84ca..1b29e9136758 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -5,12 +5,12 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8
9#include <linux/module.h> 8#include <linux/module.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/blkdev.h> 10#include <linux/blkdev.h>
12#include <linux/bio.h> 11#include <linux/bio.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/device-mapper.h>
14 14
15#define DM_MSG_PREFIX "linear" 15#define DM_MSG_PREFIX "linear"
16 16
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5b48478c79f5..a8c0fc79ca78 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -12,7 +12,7 @@
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/dm-dirty-log.h> 13#include <linux/dm-dirty-log.h>
14 14
15#include "dm.h" 15#include <linux/device-mapper.h>
16 16
17#define DM_MSG_PREFIX "dirty region log" 17#define DM_MSG_PREFIX "dirty region log"
18 18
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 9bf3460c5540..abf6e8cfaedb 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -5,7 +5,8 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h" 8#include <linux/device-mapper.h>
9
9#include "dm-path-selector.h" 10#include "dm-path-selector.h"
10#include "dm-bio-list.h" 11#include "dm-bio-list.h"
11#include "dm-bio-record.h" 12#include "dm-bio-record.h"
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
index ca1bb636a3e4..96ea226155b1 100644
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@@ -9,7 +9,8 @@
9 * Path selector registration. 9 * Path selector registration.
10 */ 10 */
11 11
12#include "dm.h" 12#include <linux/device-mapper.h>
13
13#include "dm-path-selector.h" 14#include "dm-path-selector.h"
14 15
15#include <linux/slab.h> 16#include <linux/slab.h>
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 29913e42c4ab..92dcc06832a4 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1,30 +1,30 @@
1/* 1/*
2 * Copyright (C) 2003 Sistina Software Limited. 2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
3 * 4 *
4 * This file is released under the GPL. 5 * This file is released under the GPL.
5 */ 6 */
6 7
7#include "dm.h"
8#include "dm-bio-list.h" 8#include "dm-bio-list.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10 10
11#include <linux/ctype.h>
12#include <linux/init.h> 11#include <linux/init.h>
13#include <linux/mempool.h> 12#include <linux/mempool.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/pagemap.h> 14#include <linux/pagemap.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <linux/time.h>
18#include <linux/vmalloc.h>
19#include <linux/workqueue.h> 16#include <linux/workqueue.h>
20#include <linux/log2.h> 17#include <linux/device-mapper.h>
21#include <linux/hardirq.h>
22#include <linux/dm-io.h> 18#include <linux/dm-io.h>
23#include <linux/dm-dirty-log.h> 19#include <linux/dm-dirty-log.h>
24#include <linux/dm-kcopyd.h> 20#include <linux/dm-kcopyd.h>
21#include <linux/dm-region-hash.h>
25 22
26#define DM_MSG_PREFIX "raid1" 23#define DM_MSG_PREFIX "raid1"
24
25#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
27#define DM_IO_PAGES 64 26#define DM_IO_PAGES 64
27#define DM_KCOPYD_PAGES 64
28 28
29#define DM_RAID1_HANDLE_ERRORS 0x01 29#define DM_RAID1_HANDLE_ERRORS 0x01
30#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) 30#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
@@ -32,87 +32,6 @@
32static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); 32static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
33 33
34/*----------------------------------------------------------------- 34/*-----------------------------------------------------------------
35 * Region hash
36 *
37 * The mirror splits itself up into discrete regions. Each
38 * region can be in one of three states: clean, dirty,
39 * nosync. There is no need to put clean regions in the hash.
40 *
41 * In addition to being present in the hash table a region _may_
42 * be present on one of three lists.
43 *
44 * clean_regions: Regions on this list have no io pending to
45 * them, they are in sync, we are no longer interested in them,
46 * they are dull. rh_update_states() will remove them from the
47 * hash table.
48 *
49 * quiesced_regions: These regions have been spun down, ready
50 * for recovery. rh_recovery_start() will remove regions from
51 * this list and hand them to kmirrord, which will schedule the
52 * recovery io with kcopyd.
53 *
54 * recovered_regions: Regions that kcopyd has successfully
55 * recovered. rh_update_states() will now schedule any delayed
56 * io, up the recovery_count, and remove the region from the
57 * hash.
58 *
59 * There are 2 locks:
60 * A rw spin lock 'hash_lock' protects just the hash table,
61 * this is never held in write mode from interrupt context,
62 * which I believe means that we only have to disable irqs when
63 * doing a write lock.
64 *
65 * An ordinary spin lock 'region_lock' that protects the three
66 * lists in the region_hash, with the 'state', 'list' and
67 * 'bhs_delayed' fields of the regions. This is used from irq
68 * context, so all other uses will have to suspend local irqs.
69 *---------------------------------------------------------------*/
70struct mirror_set;
71struct region_hash {
72 struct mirror_set *ms;
73 uint32_t region_size;
74 unsigned region_shift;
75
76 /* holds persistent region state */
77 struct dm_dirty_log *log;
78
79 /* hash table */
80 rwlock_t hash_lock;
81 mempool_t *region_pool;
82 unsigned int mask;
83 unsigned int nr_buckets;
84 struct list_head *buckets;
85
86 spinlock_t region_lock;
87 atomic_t recovery_in_flight;
88 struct semaphore recovery_count;
89 struct list_head clean_regions;
90 struct list_head quiesced_regions;
91 struct list_head recovered_regions;
92 struct list_head failed_recovered_regions;
93};
94
95enum {
96 RH_CLEAN,
97 RH_DIRTY,
98 RH_NOSYNC,
99 RH_RECOVERING
100};
101
102struct region {
103 struct region_hash *rh; /* FIXME: can we get rid of this ? */
104 region_t key;
105 int state;
106
107 struct list_head hash_list;
108 struct list_head list;
109
110 atomic_t pending;
111 struct bio_list delayed_bios;
112};
113
114
115/*-----------------------------------------------------------------
116 * Mirror set structures. 35 * Mirror set structures.
117 *---------------------------------------------------------------*/ 36 *---------------------------------------------------------------*/
118enum dm_raid1_error { 37enum dm_raid1_error {
@@ -132,8 +51,7 @@ struct mirror {
132struct mirror_set { 51struct mirror_set {
133 struct dm_target *ti; 52 struct dm_target *ti;
134 struct list_head list; 53 struct list_head list;
135 struct region_hash rh; 54
136 struct dm_kcopyd_client *kcopyd_client;
137 uint64_t features; 55 uint64_t features;
138 56
139 spinlock_t lock; /* protects the lists */ 57 spinlock_t lock; /* protects the lists */
@@ -141,6 +59,8 @@ struct mirror_set {
141 struct bio_list writes; 59 struct bio_list writes;
142 struct bio_list failures; 60 struct bio_list failures;
143 61
62 struct dm_region_hash *rh;
63 struct dm_kcopyd_client *kcopyd_client;
144 struct dm_io_client *io_client; 64 struct dm_io_client *io_client;
145 mempool_t *read_record_pool; 65 mempool_t *read_record_pool;
146 66
@@ -159,25 +79,14 @@ struct mirror_set {
159 79
160 struct work_struct trigger_event; 80 struct work_struct trigger_event;
161 81
162 unsigned int nr_mirrors; 82 unsigned nr_mirrors;
163 struct mirror mirror[0]; 83 struct mirror mirror[0];
164}; 84};
165 85
166/* 86static void wakeup_mirrord(void *context)
167 * Conversion fns
168 */
169static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
170{
171 return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
172}
173
174static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
175{ 87{
176 return region << rh->region_shift; 88 struct mirror_set *ms = context;
177}
178 89
179static void wake(struct mirror_set *ms)
180{
181 queue_work(ms->kmirrord_wq, &ms->kmirrord_work); 90 queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
182} 91}
183 92
@@ -186,7 +95,7 @@ static void delayed_wake_fn(unsigned long data)
186 struct mirror_set *ms = (struct mirror_set *) data; 95 struct mirror_set *ms = (struct mirror_set *) data;
187 96
188 clear_bit(0, &ms->timer_pending); 97 clear_bit(0, &ms->timer_pending);
189 wake(ms); 98 wakeup_mirrord(ms);
190} 99}
191 100
192static void delayed_wake(struct mirror_set *ms) 101static void delayed_wake(struct mirror_set *ms)
@@ -200,473 +109,34 @@ static void delayed_wake(struct mirror_set *ms)
200 add_timer(&ms->timer); 109 add_timer(&ms->timer);
201} 110}
202 111
203/* FIXME move this */ 112static void wakeup_all_recovery_waiters(void *context)
204static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
205
206#define MIN_REGIONS 64
207#define MAX_RECOVERY 1
208static int rh_init(struct region_hash *rh, struct mirror_set *ms,
209 struct dm_dirty_log *log, uint32_t region_size,
210 region_t nr_regions)
211{ 113{
212 unsigned int nr_buckets, max_buckets; 114 wake_up_all(&_kmirrord_recovery_stopped);
213 size_t i;
214
215 /*
216 * Calculate a suitable number of buckets for our hash
217 * table.
218 */
219 max_buckets = nr_regions >> 6;
220 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
221 ;
222 nr_buckets >>= 1;
223
224 rh->ms = ms;
225 rh->log = log;
226 rh->region_size = region_size;
227 rh->region_shift = ffs(region_size) - 1;
228 rwlock_init(&rh->hash_lock);
229 rh->mask = nr_buckets - 1;
230 rh->nr_buckets = nr_buckets;
231
232 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
233 if (!rh->buckets) {
234 DMERR("unable to allocate region hash memory");
235 return -ENOMEM;
236 }
237
238 for (i = 0; i < nr_buckets; i++)
239 INIT_LIST_HEAD(rh->buckets + i);
240
241 spin_lock_init(&rh->region_lock);
242 sema_init(&rh->recovery_count, 0);
243 atomic_set(&rh->recovery_in_flight, 0);
244 INIT_LIST_HEAD(&rh->clean_regions);
245 INIT_LIST_HEAD(&rh->quiesced_regions);
246 INIT_LIST_HEAD(&rh->recovered_regions);
247 INIT_LIST_HEAD(&rh->failed_recovered_regions);
248
249 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
250 sizeof(struct region));
251 if (!rh->region_pool) {
252 vfree(rh->buckets);
253 rh->buckets = NULL;
254 return -ENOMEM;
255 }
256
257 return 0;
258} 115}
259 116
260static void rh_exit(struct region_hash *rh) 117static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
261{
262 unsigned int h;
263 struct region *reg, *nreg;
264
265 BUG_ON(!list_empty(&rh->quiesced_regions));
266 for (h = 0; h < rh->nr_buckets; h++) {
267 list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
268 BUG_ON(atomic_read(&reg->pending));
269 mempool_free(reg, rh->region_pool);
270 }
271 }
272
273 if (rh->log)
274 dm_dirty_log_destroy(rh->log);
275 if (rh->region_pool)
276 mempool_destroy(rh->region_pool);
277 vfree(rh->buckets);
278}
279
280#define RH_HASH_MULT 2654435387U
281
282static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
283{
284 return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
285}
286
287static struct region *__rh_lookup(struct region_hash *rh, region_t region)
288{
289 struct region *reg;
290
291 list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
292 if (reg->key == region)
293 return reg;
294
295 return NULL;
296}
297
298static void __rh_insert(struct region_hash *rh, struct region *reg)
299{
300 unsigned int h = rh_hash(rh, reg->key);
301 list_add(&reg->hash_list, rh->buckets + h);
302}
303
304static struct region *__rh_alloc(struct region_hash *rh, region_t region)
305{
306 struct region *reg, *nreg;
307
308 read_unlock(&rh->hash_lock);
309 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
310 if (unlikely(!nreg))
311 nreg = kmalloc(sizeof(struct region), GFP_NOIO);
312 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
313 RH_CLEAN : RH_NOSYNC;
314 nreg->rh = rh;
315 nreg->key = region;
316
317 INIT_LIST_HEAD(&nreg->list);
318
319 atomic_set(&nreg->pending, 0);
320 bio_list_init(&nreg->delayed_bios);
321 write_lock_irq(&rh->hash_lock);
322
323 reg = __rh_lookup(rh, region);
324 if (reg)
325 /* we lost the race */
326 mempool_free(nreg, rh->region_pool);
327
328 else {
329 __rh_insert(rh, nreg);
330 if (nreg->state == RH_CLEAN) {
331 spin_lock(&rh->region_lock);
332 list_add(&nreg->list, &rh->clean_regions);
333 spin_unlock(&rh->region_lock);
334 }
335 reg = nreg;
336 }
337 write_unlock_irq(&rh->hash_lock);
338 read_lock(&rh->hash_lock);
339
340 return reg;
341}
342
343static inline struct region *__rh_find(struct region_hash *rh, region_t region)
344{
345 struct region *reg;
346
347 reg = __rh_lookup(rh, region);
348 if (!reg)
349 reg = __rh_alloc(rh, region);
350
351 return reg;
352}
353
354static int rh_state(struct region_hash *rh, region_t region, int may_block)
355{
356 int r;
357 struct region *reg;
358
359 read_lock(&rh->hash_lock);
360 reg = __rh_lookup(rh, region);
361 read_unlock(&rh->hash_lock);
362
363 if (reg)
364 return reg->state;
365
366 /*
367 * The region wasn't in the hash, so we fall back to the
368 * dirty log.
369 */
370 r = rh->log->type->in_sync(rh->log, region, may_block);
371
372 /*
373 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
374 * taken as a RH_NOSYNC
375 */
376 return r == 1 ? RH_CLEAN : RH_NOSYNC;
377}
378
379static inline int rh_in_sync(struct region_hash *rh,
380 region_t region, int may_block)
381{
382 int state = rh_state(rh, region, may_block);
383 return state == RH_CLEAN || state == RH_DIRTY;
384}
385
386static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
387{
388 struct bio *bio;
389
390 while ((bio = bio_list_pop(bio_list))) {
391 queue_bio(ms, bio, WRITE);
392 }
393}
394
395static void complete_resync_work(struct region *reg, int success)
396{
397 struct region_hash *rh = reg->rh;
398
399 rh->log->type->set_region_sync(rh->log, reg->key, success);
400
401 /*
402 * Dispatch the bios before we call 'wake_up_all'.
403 * This is important because if we are suspending,
404 * we want to know that recovery is complete and
405 * the work queue is flushed. If we wake_up_all
406 * before we dispatch_bios (queue bios and call wake()),
407 * then we risk suspending before the work queue
408 * has been properly flushed.
409 */
410 dispatch_bios(rh->ms, &reg->delayed_bios);
411 if (atomic_dec_and_test(&rh->recovery_in_flight))
412 wake_up_all(&_kmirrord_recovery_stopped);
413 up(&rh->recovery_count);
414}
415
416static void rh_update_states(struct region_hash *rh)
417{
418 struct region *reg, *next;
419
420 LIST_HEAD(clean);
421 LIST_HEAD(recovered);
422 LIST_HEAD(failed_recovered);
423
424 /*
425 * Quickly grab the lists.
426 */
427 write_lock_irq(&rh->hash_lock);
428 spin_lock(&rh->region_lock);
429 if (!list_empty(&rh->clean_regions)) {
430 list_splice_init(&rh->clean_regions, &clean);
431
432 list_for_each_entry(reg, &clean, list)
433 list_del(&reg->hash_list);
434 }
435
436 if (!list_empty(&rh->recovered_regions)) {
437 list_splice_init(&rh->recovered_regions, &recovered);
438
439 list_for_each_entry (reg, &recovered, list)
440 list_del(&reg->hash_list);
441 }
442
443 if (!list_empty(&rh->failed_recovered_regions)) {
444 list_splice_init(&rh->failed_recovered_regions,
445 &failed_recovered);
446
447 list_for_each_entry(reg, &failed_recovered, list)
448 list_del(&reg->hash_list);
449 }
450
451 spin_unlock(&rh->region_lock);
452 write_unlock_irq(&rh->hash_lock);
453
454 /*
455 * All the regions on the recovered and clean lists have
456 * now been pulled out of the system, so no need to do
457 * any more locking.
458 */
459 list_for_each_entry_safe (reg, next, &recovered, list) {
460 rh->log->type->clear_region(rh->log, reg->key);
461 complete_resync_work(reg, 1);
462 mempool_free(reg, rh->region_pool);
463 }
464
465 list_for_each_entry_safe(reg, next, &failed_recovered, list) {
466 complete_resync_work(reg, errors_handled(rh->ms) ? 0 : 1);
467 mempool_free(reg, rh->region_pool);
468 }
469
470 list_for_each_entry_safe(reg, next, &clean, list) {
471 rh->log->type->clear_region(rh->log, reg->key);
472 mempool_free(reg, rh->region_pool);
473 }
474
475 rh->log->type->flush(rh->log);
476}
477
478static void rh_inc(struct region_hash *rh, region_t region)
479{
480 struct region *reg;
481
482 read_lock(&rh->hash_lock);
483 reg = __rh_find(rh, region);
484
485 spin_lock_irq(&rh->region_lock);
486 atomic_inc(&reg->pending);
487
488 if (reg->state == RH_CLEAN) {
489 reg->state = RH_DIRTY;
490 list_del_init(&reg->list); /* take off the clean list */
491 spin_unlock_irq(&rh->region_lock);
492
493 rh->log->type->mark_region(rh->log, reg->key);
494 } else
495 spin_unlock_irq(&rh->region_lock);
496
497
498 read_unlock(&rh->hash_lock);
499}
500
501static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
502{
503 struct bio *bio;
504
505 for (bio = bios->head; bio; bio = bio->bi_next)
506 rh_inc(rh, bio_to_region(rh, bio));
507}
508
509static void rh_dec(struct region_hash *rh, region_t region)
510{ 118{
511 unsigned long flags; 119 unsigned long flags;
512 struct region *reg;
513 int should_wake = 0; 120 int should_wake = 0;
121 struct bio_list *bl;
514 122
515 read_lock(&rh->hash_lock); 123 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
516 reg = __rh_lookup(rh, region); 124 spin_lock_irqsave(&ms->lock, flags);
517 read_unlock(&rh->hash_lock); 125 should_wake = !(bl->head);
518 126 bio_list_add(bl, bio);
519 spin_lock_irqsave(&rh->region_lock, flags); 127 spin_unlock_irqrestore(&ms->lock, flags);
520 if (atomic_dec_and_test(&reg->pending)) {
521 /*
522 * There is no pending I/O for this region.
523 * We can move the region to corresponding list for next action.
524 * At this point, the region is not yet connected to any list.
525 *
526 * If the state is RH_NOSYNC, the region should be kept off
527 * from clean list.
528 * The hash entry for RH_NOSYNC will remain in memory
529 * until the region is recovered or the map is reloaded.
530 */
531
532 /* do nothing for RH_NOSYNC */
533 if (reg->state == RH_RECOVERING) {
534 list_add_tail(&reg->list, &rh->quiesced_regions);
535 } else if (reg->state == RH_DIRTY) {
536 reg->state = RH_CLEAN;
537 list_add(&reg->list, &rh->clean_regions);
538 }
539 should_wake = 1;
540 }
541 spin_unlock_irqrestore(&rh->region_lock, flags);
542 128
543 if (should_wake) 129 if (should_wake)
544 wake(rh->ms); 130 wakeup_mirrord(ms);
545}
546
547/*
548 * Starts quiescing a region in preparation for recovery.
549 */
550static int __rh_recovery_prepare(struct region_hash *rh)
551{
552 int r;
553 struct region *reg;
554 region_t region;
555
556 /*
557 * Ask the dirty log what's next.
558 */
559 r = rh->log->type->get_resync_work(rh->log, &region);
560 if (r <= 0)
561 return r;
562
563 /*
564 * Get this region, and start it quiescing by setting the
565 * recovering flag.
566 */
567 read_lock(&rh->hash_lock);
568 reg = __rh_find(rh, region);
569 read_unlock(&rh->hash_lock);
570
571 spin_lock_irq(&rh->region_lock);
572 reg->state = RH_RECOVERING;
573
574 /* Already quiesced ? */
575 if (atomic_read(&reg->pending))
576 list_del_init(&reg->list);
577 else
578 list_move(&reg->list, &rh->quiesced_regions);
579
580 spin_unlock_irq(&rh->region_lock);
581
582 return 1;
583}
584
585static void rh_recovery_prepare(struct region_hash *rh)
586{
587 /* Extra reference to avoid race with rh_stop_recovery */
588 atomic_inc(&rh->recovery_in_flight);
589
590 while (!down_trylock(&rh->recovery_count)) {
591 atomic_inc(&rh->recovery_in_flight);
592 if (__rh_recovery_prepare(rh) <= 0) {
593 atomic_dec(&rh->recovery_in_flight);
594 up(&rh->recovery_count);
595 break;
596 }
597 }
598
599 /* Drop the extra reference */
600 if (atomic_dec_and_test(&rh->recovery_in_flight))
601 wake_up_all(&_kmirrord_recovery_stopped);
602}
603
604/*
605 * Returns any quiesced regions.
606 */
607static struct region *rh_recovery_start(struct region_hash *rh)
608{
609 struct region *reg = NULL;
610
611 spin_lock_irq(&rh->region_lock);
612 if (!list_empty(&rh->quiesced_regions)) {
613 reg = list_entry(rh->quiesced_regions.next,
614 struct region, list);
615 list_del_init(&reg->list); /* remove from the quiesced list */
616 }
617 spin_unlock_irq(&rh->region_lock);
618
619 return reg;
620}
621
622static void rh_recovery_end(struct region *reg, int success)
623{
624 struct region_hash *rh = reg->rh;
625
626 spin_lock_irq(&rh->region_lock);
627 if (success)
628 list_add(&reg->list, &reg->rh->recovered_regions);
629 else {
630 reg->state = RH_NOSYNC;
631 list_add(&reg->list, &reg->rh->failed_recovered_regions);
632 }
633 spin_unlock_irq(&rh->region_lock);
634
635 wake(rh->ms);
636} 131}
637 132
638static int rh_flush(struct region_hash *rh) 133static void dispatch_bios(void *context, struct bio_list *bio_list)
639{ 134{
640 return rh->log->type->flush(rh->log); 135 struct mirror_set *ms = context;
641} 136 struct bio *bio;
642
643static void rh_delay(struct region_hash *rh, struct bio *bio)
644{
645 struct region *reg;
646
647 read_lock(&rh->hash_lock);
648 reg = __rh_find(rh, bio_to_region(rh, bio));
649 bio_list_add(&reg->delayed_bios, bio);
650 read_unlock(&rh->hash_lock);
651}
652
653static void rh_stop_recovery(struct region_hash *rh)
654{
655 int i;
656
657 /* wait for any recovering regions */
658 for (i = 0; i < MAX_RECOVERY; i++)
659 down(&rh->recovery_count);
660}
661
662static void rh_start_recovery(struct region_hash *rh)
663{
664 int i;
665
666 for (i = 0; i < MAX_RECOVERY; i++)
667 up(&rh->recovery_count);
668 137
669 wake(rh->ms); 138 while ((bio = bio_list_pop(bio_list)))
139 queue_bio(ms, bio, WRITE);
670} 140}
671 141
672#define MIN_READ_RECORDS 20 142#define MIN_READ_RECORDS 20
@@ -776,8 +246,8 @@ out:
776static void recovery_complete(int read_err, unsigned long write_err, 246static void recovery_complete(int read_err, unsigned long write_err,
777 void *context) 247 void *context)
778{ 248{
779 struct region *reg = (struct region *)context; 249 struct dm_region *reg = context;
780 struct mirror_set *ms = reg->rh->ms; 250 struct mirror_set *ms = dm_rh_region_context(reg);
781 int m, bit = 0; 251 int m, bit = 0;
782 252
783 if (read_err) { 253 if (read_err) {
@@ -803,31 +273,33 @@ static void recovery_complete(int read_err, unsigned long write_err,
803 } 273 }
804 } 274 }
805 275
806 rh_recovery_end(reg, !(read_err || write_err)); 276 dm_rh_recovery_end(reg, !(read_err || write_err));
807} 277}
808 278
809static int recover(struct mirror_set *ms, struct region *reg) 279static int recover(struct mirror_set *ms, struct dm_region *reg)
810{ 280{
811 int r; 281 int r;
812 unsigned int i; 282 unsigned i;
813 struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; 283 struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
814 struct mirror *m; 284 struct mirror *m;
815 unsigned long flags = 0; 285 unsigned long flags = 0;
286 region_t key = dm_rh_get_region_key(reg);
287 sector_t region_size = dm_rh_get_region_size(ms->rh);
816 288
817 /* fill in the source */ 289 /* fill in the source */
818 m = get_default_mirror(ms); 290 m = get_default_mirror(ms);
819 from.bdev = m->dev->bdev; 291 from.bdev = m->dev->bdev;
820 from.sector = m->offset + region_to_sector(reg->rh, reg->key); 292 from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
821 if (reg->key == (ms->nr_regions - 1)) { 293 if (key == (ms->nr_regions - 1)) {
822 /* 294 /*
823 * The final region may be smaller than 295 * The final region may be smaller than
824 * region_size. 296 * region_size.
825 */ 297 */
826 from.count = ms->ti->len & (reg->rh->region_size - 1); 298 from.count = ms->ti->len & (region_size - 1);
827 if (!from.count) 299 if (!from.count)
828 from.count = reg->rh->region_size; 300 from.count = region_size;
829 } else 301 } else
830 from.count = reg->rh->region_size; 302 from.count = region_size;
831 303
832 /* fill in the destinations */ 304 /* fill in the destinations */
833 for (i = 0, dest = to; i < ms->nr_mirrors; i++) { 305 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
@@ -836,7 +308,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
836 308
837 m = ms->mirror + i; 309 m = ms->mirror + i;
838 dest->bdev = m->dev->bdev; 310 dest->bdev = m->dev->bdev;
839 dest->sector = m->offset + region_to_sector(reg->rh, reg->key); 311 dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
840 dest->count = from.count; 312 dest->count = from.count;
841 dest++; 313 dest++;
842 } 314 }
@@ -853,22 +325,22 @@ static int recover(struct mirror_set *ms, struct region *reg)
853 325
854static void do_recovery(struct mirror_set *ms) 326static void do_recovery(struct mirror_set *ms)
855{ 327{
328 struct dm_region *reg;
329 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
856 int r; 330 int r;
857 struct region *reg;
858 struct dm_dirty_log *log = ms->rh.log;
859 331
860 /* 332 /*
861 * Start quiescing some regions. 333 * Start quiescing some regions.
862 */ 334 */
863 rh_recovery_prepare(&ms->rh); 335 dm_rh_recovery_prepare(ms->rh);
864 336
865 /* 337 /*
866 * Copy any already quiesced regions. 338 * Copy any already quiesced regions.
867 */ 339 */
868 while ((reg = rh_recovery_start(&ms->rh))) { 340 while ((reg = dm_rh_recovery_start(ms->rh))) {
869 r = recover(ms, reg); 341 r = recover(ms, reg);
870 if (r) 342 if (r)
871 rh_recovery_end(reg, 0); 343 dm_rh_recovery_end(reg, 0);
872 } 344 }
873 345
874 /* 346 /*
@@ -909,9 +381,10 @@ static int default_ok(struct mirror *m)
909 381
910static int mirror_available(struct mirror_set *ms, struct bio *bio) 382static int mirror_available(struct mirror_set *ms, struct bio *bio)
911{ 383{
912 region_t region = bio_to_region(&ms->rh, bio); 384 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
385 region_t region = dm_rh_bio_to_region(ms->rh, bio);
913 386
914 if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) 387 if (log->type->in_sync(log, region, 0))
915 return choose_mirror(ms, bio->bi_sector) ? 1 : 0; 388 return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
916 389
917 return 0; 390 return 0;
@@ -985,7 +458,14 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
985 458
986 map_region(&io, m, bio); 459 map_region(&io, m, bio);
987 bio_set_m(bio, m); 460 bio_set_m(bio, m);
988 (void) dm_io(&io_req, 1, &io, NULL); 461 BUG_ON(dm_io(&io_req, 1, &io, NULL));
462}
463
464static inline int region_in_sync(struct mirror_set *ms, region_t region,
465 int may_block)
466{
467 int state = dm_rh_get_state(ms->rh, region, may_block);
468 return state == DM_RH_CLEAN || state == DM_RH_DIRTY;
989} 469}
990 470
991static void do_reads(struct mirror_set *ms, struct bio_list *reads) 471static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -995,13 +475,13 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
995 struct mirror *m; 475 struct mirror *m;
996 476
997 while ((bio = bio_list_pop(reads))) { 477 while ((bio = bio_list_pop(reads))) {
998 region = bio_to_region(&ms->rh, bio); 478 region = dm_rh_bio_to_region(ms->rh, bio);
999 m = get_default_mirror(ms); 479 m = get_default_mirror(ms);
1000 480
1001 /* 481 /*
1002 * We can only read balance if the region is in sync. 482 * We can only read balance if the region is in sync.
1003 */ 483 */
1004 if (likely(rh_in_sync(&ms->rh, region, 1))) 484 if (likely(region_in_sync(ms, region, 1)))
1005 m = choose_mirror(ms, bio->bi_sector); 485 m = choose_mirror(ms, bio->bi_sector);
1006 else if (m && atomic_read(&m->error_count)) 486 else if (m && atomic_read(&m->error_count))
1007 m = NULL; 487 m = NULL;
@@ -1024,57 +504,6 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
1024 * NOSYNC: increment pending, just write to the default mirror 504 * NOSYNC: increment pending, just write to the default mirror
1025 *---------------------------------------------------------------*/ 505 *---------------------------------------------------------------*/
1026 506
1027/* __bio_mark_nosync
1028 * @ms
1029 * @bio
1030 * @done
1031 * @error
1032 *
1033 * The bio was written on some mirror(s) but failed on other mirror(s).
1034 * We can successfully endio the bio but should avoid the region being
1035 * marked clean by setting the state RH_NOSYNC.
1036 *
1037 * This function is _not_ safe in interrupt context!
1038 */
1039static void __bio_mark_nosync(struct mirror_set *ms,
1040 struct bio *bio, unsigned done, int error)
1041{
1042 unsigned long flags;
1043 struct region_hash *rh = &ms->rh;
1044 struct dm_dirty_log *log = ms->rh.log;
1045 struct region *reg;
1046 region_t region = bio_to_region(rh, bio);
1047 int recovering = 0;
1048
1049 /* We must inform the log that the sync count has changed. */
1050 log->type->set_region_sync(log, region, 0);
1051 ms->in_sync = 0;
1052
1053 read_lock(&rh->hash_lock);
1054 reg = __rh_find(rh, region);
1055 read_unlock(&rh->hash_lock);
1056
1057 /* region hash entry should exist because write was in-flight */
1058 BUG_ON(!reg);
1059 BUG_ON(!list_empty(&reg->list));
1060
1061 spin_lock_irqsave(&rh->region_lock, flags);
1062 /*
1063 * Possible cases:
1064 * 1) RH_DIRTY
1065 * 2) RH_NOSYNC: was dirty, other preceeding writes failed
1066 * 3) RH_RECOVERING: flushing pending writes
1067 * Either case, the region should have not been connected to list.
1068 */
1069 recovering = (reg->state == RH_RECOVERING);
1070 reg->state = RH_NOSYNC;
1071 BUG_ON(!list_empty(&reg->list));
1072 spin_unlock_irqrestore(&rh->region_lock, flags);
1073
1074 bio_endio(bio, error);
1075 if (recovering)
1076 complete_resync_work(reg, 0);
1077}
1078 507
1079static void write_callback(unsigned long error, void *context) 508static void write_callback(unsigned long error, void *context)
1080{ 509{
@@ -1119,7 +548,7 @@ static void write_callback(unsigned long error, void *context)
1119 bio_list_add(&ms->failures, bio); 548 bio_list_add(&ms->failures, bio);
1120 spin_unlock_irqrestore(&ms->lock, flags); 549 spin_unlock_irqrestore(&ms->lock, flags);
1121 if (should_wake) 550 if (should_wake)
1122 wake(ms); 551 wakeup_mirrord(ms);
1123 return; 552 return;
1124 } 553 }
1125out: 554out:
@@ -1149,7 +578,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
1149 */ 578 */
1150 bio_set_m(bio, get_default_mirror(ms)); 579 bio_set_m(bio, get_default_mirror(ms));
1151 580
1152 (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); 581 BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
1153} 582}
1154 583
1155static void do_writes(struct mirror_set *ms, struct bio_list *writes) 584static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -1169,18 +598,19 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1169 bio_list_init(&recover); 598 bio_list_init(&recover);
1170 599
1171 while ((bio = bio_list_pop(writes))) { 600 while ((bio = bio_list_pop(writes))) {
1172 state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); 601 state = dm_rh_get_state(ms->rh,
602 dm_rh_bio_to_region(ms->rh, bio), 1);
1173 switch (state) { 603 switch (state) {
1174 case RH_CLEAN: 604 case DM_RH_CLEAN:
1175 case RH_DIRTY: 605 case DM_RH_DIRTY:
1176 this_list = &sync; 606 this_list = &sync;
1177 break; 607 break;
1178 608
1179 case RH_NOSYNC: 609 case DM_RH_NOSYNC:
1180 this_list = &nosync; 610 this_list = &nosync;
1181 break; 611 break;
1182 612
1183 case RH_RECOVERING: 613 case DM_RH_RECOVERING:
1184 this_list = &recover; 614 this_list = &recover;
1185 break; 615 break;
1186 } 616 }
@@ -1193,9 +623,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1193 * be written to (writes to recover regions are going to 623 * be written to (writes to recover regions are going to
1194 * be delayed). 624 * be delayed).
1195 */ 625 */
1196 rh_inc_pending(&ms->rh, &sync); 626 dm_rh_inc_pending(ms->rh, &sync);
1197 rh_inc_pending(&ms->rh, &nosync); 627 dm_rh_inc_pending(ms->rh, &nosync);
1198 ms->log_failure = rh_flush(&ms->rh) ? 1 : 0; 628 ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0;
1199 629
1200 /* 630 /*
1201 * Dispatch io. 631 * Dispatch io.
@@ -1204,13 +634,13 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1204 spin_lock_irq(&ms->lock); 634 spin_lock_irq(&ms->lock);
1205 bio_list_merge(&ms->failures, &sync); 635 bio_list_merge(&ms->failures, &sync);
1206 spin_unlock_irq(&ms->lock); 636 spin_unlock_irq(&ms->lock);
1207 wake(ms); 637 wakeup_mirrord(ms);
1208 } else 638 } else
1209 while ((bio = bio_list_pop(&sync))) 639 while ((bio = bio_list_pop(&sync)))
1210 do_write(ms, bio); 640 do_write(ms, bio);
1211 641
1212 while ((bio = bio_list_pop(&recover))) 642 while ((bio = bio_list_pop(&recover)))
1213 rh_delay(&ms->rh, bio); 643 dm_rh_delay(ms->rh, bio);
1214 644
1215 while ((bio = bio_list_pop(&nosync))) { 645 while ((bio = bio_list_pop(&nosync))) {
1216 map_bio(get_default_mirror(ms), bio); 646 map_bio(get_default_mirror(ms), bio);
@@ -1227,7 +657,8 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
1227 657
1228 if (!ms->log_failure) { 658 if (!ms->log_failure) {
1229 while ((bio = bio_list_pop(failures))) 659 while ((bio = bio_list_pop(failures)))
1230 __bio_mark_nosync(ms, bio, bio->bi_size, 0); 660 ms->in_sync = 0;
661 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
1231 return; 662 return;
1232 } 663 }
1233 664
@@ -1280,8 +711,8 @@ static void trigger_event(struct work_struct *work)
1280 *---------------------------------------------------------------*/ 711 *---------------------------------------------------------------*/
1281static void do_mirror(struct work_struct *work) 712static void do_mirror(struct work_struct *work)
1282{ 713{
1283 struct mirror_set *ms =container_of(work, struct mirror_set, 714 struct mirror_set *ms = container_of(work, struct mirror_set,
1284 kmirrord_work); 715 kmirrord_work);
1285 struct bio_list reads, writes, failures; 716 struct bio_list reads, writes, failures;
1286 unsigned long flags; 717 unsigned long flags;
1287 718
@@ -1294,7 +725,7 @@ static void do_mirror(struct work_struct *work)
1294 bio_list_init(&ms->failures); 725 bio_list_init(&ms->failures);
1295 spin_unlock_irqrestore(&ms->lock, flags); 726 spin_unlock_irqrestore(&ms->lock, flags);
1296 727
1297 rh_update_states(&ms->rh); 728 dm_rh_update_states(ms->rh, errors_handled(ms));
1298 do_recovery(ms); 729 do_recovery(ms);
1299 do_reads(ms, &reads); 730 do_reads(ms, &reads);
1300 do_writes(ms, &writes); 731 do_writes(ms, &writes);
@@ -1303,7 +734,6 @@ static void do_mirror(struct work_struct *work)
1303 dm_table_unplug_all(ms->ti->table); 734 dm_table_unplug_all(ms->ti->table);
1304} 735}
1305 736
1306
1307/*----------------------------------------------------------------- 737/*-----------------------------------------------------------------
1308 * Target functions 738 * Target functions
1309 *---------------------------------------------------------------*/ 739 *---------------------------------------------------------------*/
@@ -1315,9 +745,6 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
1315 size_t len; 745 size_t len;
1316 struct mirror_set *ms = NULL; 746 struct mirror_set *ms = NULL;
1317 747
1318 if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
1319 return NULL;
1320
1321 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); 748 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
1322 749
1323 ms = kzalloc(len, GFP_KERNEL); 750 ms = kzalloc(len, GFP_KERNEL);
@@ -1353,7 +780,11 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
1353 return NULL; 780 return NULL;
1354 } 781 }
1355 782
1356 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 783 ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord,
784 wakeup_all_recovery_waiters,
785 ms->ti->begin, MAX_RECOVERY,
786 dl, region_size, ms->nr_regions);
787 if (IS_ERR(ms->rh)) {
1357 ti->error = "Error creating dirty region hash"; 788 ti->error = "Error creating dirty region hash";
1358 dm_io_client_destroy(ms->io_client); 789 dm_io_client_destroy(ms->io_client);
1359 mempool_destroy(ms->read_record_pool); 790 mempool_destroy(ms->read_record_pool);
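[Annotation] In this hunk the mirror target stops embedding struct region_hash and instead obtains an opaque struct dm_region_hash from dm_region_hash_create(), passing its own context together with the dispatch_bios, wakeup_mirrord and wakeup_all_recovery_waiters callbacks. The sketch below (invented *_sketch types, two callbacks instead of three) illustrates that callback-plus-context construction pattern in plain userspace C; it is not the device-mapper API.

/*
 * Sketch of the callback/context inversion (assumed simplification): the
 * shared region-hash code never sees struct mirror_set directly, it only
 * calls back through pointers supplied at creation time.
 */
#include <stdio.h>
#include <stdlib.h>

struct region_hash_sketch {
        void *context;                    /* owner's private data */
        void (*dispatch)(void *context);  /* e.g. requeue delayed bios */
        void (*wakeup)(void *context);    /* e.g. kick the owner's worker */
};

static struct region_hash_sketch *
region_hash_create(void *context,
                   void (*dispatch)(void *context),
                   void (*wakeup)(void *context))
{
        struct region_hash_sketch *rh = malloc(sizeof(*rh));

        if (!rh)
                return NULL;
        rh->context = context;
        rh->dispatch = dispatch;
        rh->wakeup = wakeup;
        return rh;
}

/* Generic code calls back into the owner without knowing its type. */
static void region_hash_update(struct region_hash_sketch *rh)
{
        rh->dispatch(rh->context);
        rh->wakeup(rh->context);
}

/* Stand-ins for struct mirror_set and its callbacks. */
struct mirror_sketch { const char *name; };

static void dispatch_bios(void *context)
{
        printf("%s: dispatching delayed bios\n",
               ((struct mirror_sketch *)context)->name);
}

static void wakeup_mirrord(void *context)
{
        printf("%s: waking worker\n",
               ((struct mirror_sketch *)context)->name);
}

int main(void)
{
        struct mirror_sketch ms = { "mirror0" };
        struct region_hash_sketch *rh =
                region_hash_create(&ms, dispatch_bios, wakeup_mirrord);

        region_hash_update(rh);
        free(rh);
        return 0;
}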
@@ -1371,7 +802,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
1371 dm_put_device(ti, ms->mirror[m].dev); 802 dm_put_device(ti, ms->mirror[m].dev);
1372 803
1373 dm_io_client_destroy(ms->io_client); 804 dm_io_client_destroy(ms->io_client);
1374 rh_exit(&ms->rh); 805 dm_region_hash_destroy(ms->rh);
1375 mempool_destroy(ms->read_record_pool); 806 mempool_destroy(ms->read_record_pool);
1376 kfree(ms); 807 kfree(ms);
1377} 808}
@@ -1411,10 +842,10 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
1411 * Create dirty log: log_type #log_params <log_params> 842 * Create dirty log: log_type #log_params <log_params>
1412 */ 843 */
1413static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, 844static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
1414 unsigned int argc, char **argv, 845 unsigned argc, char **argv,
1415 unsigned int *args_used) 846 unsigned *args_used)
1416{ 847{
1417 unsigned int param_count; 848 unsigned param_count;
1418 struct dm_dirty_log *dl; 849 struct dm_dirty_log *dl;
1419 850
1420 if (argc < 2) { 851 if (argc < 2) {
@@ -1545,7 +976,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1545 } 976 }
1546 977
1547 ti->private = ms; 978 ti->private = ms;
1548 ti->split_io = ms->rh.region_size; 979 ti->split_io = dm_rh_get_region_size(ms->rh);
1549 980
1550 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 981 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1551 if (!ms->kmirrord_wq) { 982 if (!ms->kmirrord_wq) {
@@ -1580,11 +1011,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1580 goto err_destroy_wq; 1011 goto err_destroy_wq;
1581 } 1012 }
1582 1013
1583 r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); 1014 r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client);
1584 if (r) 1015 if (r)
1585 goto err_destroy_wq; 1016 goto err_destroy_wq;
1586 1017
1587 wake(ms); 1018 wakeup_mirrord(ms);
1588 return 0; 1019 return 0;
1589 1020
1590err_destroy_wq: 1021err_destroy_wq:
@@ -1605,22 +1036,6 @@ static void mirror_dtr(struct dm_target *ti)
1605 free_context(ms, ti, ms->nr_mirrors); 1036 free_context(ms, ti, ms->nr_mirrors);
1606} 1037}
1607 1038
1608static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
1609{
1610 unsigned long flags;
1611 int should_wake = 0;
1612 struct bio_list *bl;
1613
1614 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
1615 spin_lock_irqsave(&ms->lock, flags);
1616 should_wake = !(bl->head);
1617 bio_list_add(bl, bio);
1618 spin_unlock_irqrestore(&ms->lock, flags);
1619
1620 if (should_wake)
1621 wake(ms);
1622}
1623
1624/* 1039/*
1625 * Mirror mapping function 1040 * Mirror mapping function
1626 */ 1041 */
@@ -1631,16 +1046,16 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
1631 struct mirror *m; 1046 struct mirror *m;
1632 struct mirror_set *ms = ti->private; 1047 struct mirror_set *ms = ti->private;
1633 struct dm_raid1_read_record *read_record = NULL; 1048 struct dm_raid1_read_record *read_record = NULL;
1049 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1634 1050
1635 if (rw == WRITE) { 1051 if (rw == WRITE) {
1636 /* Save region for mirror_end_io() handler */ 1052 /* Save region for mirror_end_io() handler */
1637 map_context->ll = bio_to_region(&ms->rh, bio); 1053 map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
1638 queue_bio(ms, bio, rw); 1054 queue_bio(ms, bio, rw);
1639 return DM_MAPIO_SUBMITTED; 1055 return DM_MAPIO_SUBMITTED;
1640 } 1056 }
1641 1057
1642 r = ms->rh.log->type->in_sync(ms->rh.log, 1058 r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
1643 bio_to_region(&ms->rh, bio), 0);
1644 if (r < 0 && r != -EWOULDBLOCK) 1059 if (r < 0 && r != -EWOULDBLOCK)
1645 return r; 1060 return r;
1646 1061
@@ -1688,7 +1103,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1688 * We need to dec pending if this was a write. 1103 * We need to dec pending if this was a write.
1689 */ 1104 */
1690 if (rw == WRITE) { 1105 if (rw == WRITE) {
1691 rh_dec(&ms->rh, map_context->ll); 1106 dm_rh_dec(ms->rh, map_context->ll);
1692 return error; 1107 return error;
1693 } 1108 }
1694 1109
@@ -1744,7 +1159,7 @@ out:
1744static void mirror_presuspend(struct dm_target *ti) 1159static void mirror_presuspend(struct dm_target *ti)
1745{ 1160{
1746 struct mirror_set *ms = (struct mirror_set *) ti->private; 1161 struct mirror_set *ms = (struct mirror_set *) ti->private;
1747 struct dm_dirty_log *log = ms->rh.log; 1162 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1748 1163
1749 atomic_set(&ms->suspend, 1); 1164 atomic_set(&ms->suspend, 1);
1750 1165
@@ -1752,10 +1167,10 @@ static void mirror_presuspend(struct dm_target *ti)
1752 * We must finish up all the work that we've 1167 * We must finish up all the work that we've
1753 * generated (i.e. recovery work). 1168 * generated (i.e. recovery work).
1754 */ 1169 */
1755 rh_stop_recovery(&ms->rh); 1170 dm_rh_stop_recovery(ms->rh);
1756 1171
1757 wait_event(_kmirrord_recovery_stopped, 1172 wait_event(_kmirrord_recovery_stopped,
1758 !atomic_read(&ms->rh.recovery_in_flight)); 1173 !dm_rh_recovery_in_flight(ms->rh));
1759 1174
1760 if (log->type->presuspend && log->type->presuspend(log)) 1175 if (log->type->presuspend && log->type->presuspend(log))
1761 /* FIXME: need better error handling */ 1176 /* FIXME: need better error handling */
@@ -1773,7 +1188,7 @@ static void mirror_presuspend(struct dm_target *ti)
1773static void mirror_postsuspend(struct dm_target *ti) 1188static void mirror_postsuspend(struct dm_target *ti)
1774{ 1189{
1775 struct mirror_set *ms = ti->private; 1190 struct mirror_set *ms = ti->private;
1776 struct dm_dirty_log *log = ms->rh.log; 1191 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1777 1192
1778 if (log->type->postsuspend && log->type->postsuspend(log)) 1193 if (log->type->postsuspend && log->type->postsuspend(log))
1779 /* FIXME: need better error handling */ 1194 /* FIXME: need better error handling */
@@ -1783,13 +1198,13 @@ static void mirror_postsuspend(struct dm_target *ti)
1783static void mirror_resume(struct dm_target *ti) 1198static void mirror_resume(struct dm_target *ti)
1784{ 1199{
1785 struct mirror_set *ms = ti->private; 1200 struct mirror_set *ms = ti->private;
1786 struct dm_dirty_log *log = ms->rh.log; 1201 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1787 1202
1788 atomic_set(&ms->suspend, 0); 1203 atomic_set(&ms->suspend, 0);
1789 if (log->type->resume && log->type->resume(log)) 1204 if (log->type->resume && log->type->resume(log))
1790 /* FIXME: need better error handling */ 1205 /* FIXME: need better error handling */
1791 DMWARN("log resume failed"); 1206 DMWARN("log resume failed");
1792 rh_start_recovery(&ms->rh); 1207 dm_rh_start_recovery(ms->rh);
1793} 1208}
1794 1209
1795/* 1210/*
@@ -1821,7 +1236,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1821{ 1236{
1822 unsigned int m, sz = 0; 1237 unsigned int m, sz = 0;
1823 struct mirror_set *ms = (struct mirror_set *) ti->private; 1238 struct mirror_set *ms = (struct mirror_set *) ti->private;
1824 struct dm_dirty_log *log = ms->rh.log; 1239 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1825 char buffer[ms->nr_mirrors + 1]; 1240 char buffer[ms->nr_mirrors + 1];
1826 1241
1827 switch (type) { 1242 switch (type) {
@@ -1834,15 +1249,15 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1834 buffer[m] = '\0'; 1249 buffer[m] = '\0';
1835 1250
1836 DMEMIT("%llu/%llu 1 %s ", 1251 DMEMIT("%llu/%llu 1 %s ",
1837 (unsigned long long)log->type->get_sync_count(ms->rh.log), 1252 (unsigned long long)log->type->get_sync_count(log),
1838 (unsigned long long)ms->nr_regions, buffer); 1253 (unsigned long long)ms->nr_regions, buffer);
1839 1254
1840 sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); 1255 sz += log->type->status(log, type, result+sz, maxlen-sz);
1841 1256
1842 break; 1257 break;
1843 1258
1844 case STATUSTYPE_TABLE: 1259 case STATUSTYPE_TABLE:
1845 sz = log->type->status(ms->rh.log, type, result, maxlen); 1260 sz = log->type->status(log, type, result, maxlen);
1846 1261
1847 DMEMIT("%d", ms->nr_mirrors); 1262 DMEMIT("%d", ms->nr_mirrors);
1848 for (m = 0; m < ms->nr_mirrors; m++) 1263 for (m = 0; m < ms->nr_mirrors; m++)
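
The mirror_map hunks above replace direct ms->rh.log accesses with the new accessors: the read path now fetches the log via dm_rh_dirty_log() and converts the bio with dm_rh_bio_to_region() before asking the log whether the region is in sync. A condensed sketch of that read-side decision follows; my_map_read is illustrative only, and queue_bio(), choose_mirror() and map_bio() are dm-raid1's existing helpers, taken as given here:

static int my_map_read(struct mirror_set *ms, struct bio *bio)
{
	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
	region_t region = dm_rh_bio_to_region(ms->rh, bio);
	struct mirror *m;
	int r;

	r = log->type->in_sync(log, region, 0);	/* 0: do not block */
	if (r < 0 && r != -EWOULDBLOCK)
		return r;			/* hard error from the log */

	if (r <= 0) {				/* out of sync or would block */
		queue_bio(ms, bio, READ);	/* defer to kmirrord */
		return DM_MAPIO_SUBMITTED;
	}

	m = choose_mirror(ms, bio->bi_sector);	/* in sync: any mirror will do */
	if (unlikely(!m))
		return -EIO;

	map_bio(m, bio);
	return DM_MAPIO_REMAPPED;
}
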
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
new file mode 100644
index 000000000000..59f8d9df9e1a
--- /dev/null
+++ b/drivers/md/dm-region-hash.c
@@ -0,0 +1,704 @@
1/*
2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/dm-dirty-log.h>
9#include <linux/dm-region-hash.h>
10
11#include <linux/ctype.h>
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/vmalloc.h>
15
16#include "dm.h"
17#include "dm-bio-list.h"
18
19#define DM_MSG_PREFIX "region hash"
20
21/*-----------------------------------------------------------------
22 * Region hash
23 *
24 * The mirror splits itself up into discrete regions. Each
25 * region can be in one of three states: clean, dirty,
26 * nosync. There is no need to put clean regions in the hash.
27 *
28 * In addition to being present in the hash table a region _may_
29 * be present on one of three lists.
30 *
31 * clean_regions: Regions on this list have no io pending to
32 * them, they are in sync, we are no longer interested in them,
33 * they are dull. dm_rh_update_states() will remove them from the
34 * hash table.
35 *
36 * quiesced_regions: These regions have been spun down, ready
 37 * for recovery. dm_rh_recovery_start() will remove regions from
38 * this list and hand them to kmirrord, which will schedule the
39 * recovery io with kcopyd.
40 *
41 * recovered_regions: Regions that kcopyd has successfully
42 * recovered. dm_rh_update_states() will now schedule any delayed
43 * io, up the recovery_count, and remove the region from the
44 * hash.
45 *
46 * There are 2 locks:
47 * A rw spin lock 'hash_lock' protects just the hash table,
48 * this is never held in write mode from interrupt context,
49 * which I believe means that we only have to disable irqs when
50 * doing a write lock.
51 *
52 * An ordinary spin lock 'region_lock' that protects the three
53 * lists in the region_hash, with the 'state', 'list' and
54 * 'delayed_bios' fields of the regions. This is used from irq
55 * context, so all other uses will have to suspend local irqs.
56 *---------------------------------------------------------------*/
57struct dm_region_hash {
58 uint32_t region_size;
59 unsigned region_shift;
60
61 /* holds persistent region state */
62 struct dm_dirty_log *log;
63
64 /* hash table */
65 rwlock_t hash_lock;
66 mempool_t *region_pool;
67 unsigned mask;
68 unsigned nr_buckets;
69 unsigned prime;
70 unsigned shift;
71 struct list_head *buckets;
72
73 unsigned max_recovery; /* Max # of regions to recover in parallel */
74
75 spinlock_t region_lock;
76 atomic_t recovery_in_flight;
77 struct semaphore recovery_count;
78 struct list_head clean_regions;
79 struct list_head quiesced_regions;
80 struct list_head recovered_regions;
81 struct list_head failed_recovered_regions;
82
83 void *context;
84 sector_t target_begin;
85
 86	/* Callback function to schedule bio writes */
87 void (*dispatch_bios)(void *context, struct bio_list *bios);
88
 89	/* Callback function to wake up the caller's worker thread. */
90 void (*wakeup_workers)(void *context);
91
 92	/* Callback function to wake up the caller's recovery waiters. */
93 void (*wakeup_all_recovery_waiters)(void *context);
94};
95
96struct dm_region {
97 struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
98 region_t key;
99 int state;
100
101 struct list_head hash_list;
102 struct list_head list;
103
104 atomic_t pending;
105 struct bio_list delayed_bios;
106};
107
108/*
109 * Conversion fns
110 */
111static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
112{
113 return sector >> rh->region_shift;
114}
115
116sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
117{
118 return region << rh->region_shift;
119}
120EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
121
122region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
123{
124 return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
125}
126EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
127
128void *dm_rh_region_context(struct dm_region *reg)
129{
130 return reg->rh->context;
131}
132EXPORT_SYMBOL_GPL(dm_rh_region_context);
133
134region_t dm_rh_get_region_key(struct dm_region *reg)
135{
136 return reg->key;
137}
138EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
139
140sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
141{
142 return rh->region_size;
143}
144EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
145
146/*
147 * FIXME: shall we pass in a structure instead of all these args to
148 * dm_region_hash_create()????
149 */
150#define RH_HASH_MULT 2654435387U
151#define RH_HASH_SHIFT 12
152
153#define MIN_REGIONS 64
154struct dm_region_hash *dm_region_hash_create(
155 void *context, void (*dispatch_bios)(void *context,
156 struct bio_list *bios),
157 void (*wakeup_workers)(void *context),
158 void (*wakeup_all_recovery_waiters)(void *context),
159 sector_t target_begin, unsigned max_recovery,
160 struct dm_dirty_log *log, uint32_t region_size,
161 region_t nr_regions)
162{
163 struct dm_region_hash *rh;
164 unsigned nr_buckets, max_buckets;
165 size_t i;
166
167 /*
168 * Calculate a suitable number of buckets for our hash
169 * table.
170 */
171 max_buckets = nr_regions >> 6;
172 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
173 ;
174 nr_buckets >>= 1;
175
176 rh = kmalloc(sizeof(*rh), GFP_KERNEL);
177 if (!rh) {
178 DMERR("unable to allocate region hash memory");
179 return ERR_PTR(-ENOMEM);
180 }
181
182 rh->context = context;
183 rh->dispatch_bios = dispatch_bios;
184 rh->wakeup_workers = wakeup_workers;
185 rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
186 rh->target_begin = target_begin;
187 rh->max_recovery = max_recovery;
188 rh->log = log;
189 rh->region_size = region_size;
190 rh->region_shift = ffs(region_size) - 1;
191 rwlock_init(&rh->hash_lock);
192 rh->mask = nr_buckets - 1;
193 rh->nr_buckets = nr_buckets;
194
195 rh->shift = RH_HASH_SHIFT;
196 rh->prime = RH_HASH_MULT;
197
198 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
199 if (!rh->buckets) {
200 DMERR("unable to allocate region hash bucket memory");
201 kfree(rh);
202 return ERR_PTR(-ENOMEM);
203 }
204
205 for (i = 0; i < nr_buckets; i++)
206 INIT_LIST_HEAD(rh->buckets + i);
207
208 spin_lock_init(&rh->region_lock);
209 sema_init(&rh->recovery_count, 0);
210 atomic_set(&rh->recovery_in_flight, 0);
211 INIT_LIST_HEAD(&rh->clean_regions);
212 INIT_LIST_HEAD(&rh->quiesced_regions);
213 INIT_LIST_HEAD(&rh->recovered_regions);
214 INIT_LIST_HEAD(&rh->failed_recovered_regions);
215
216 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
217 sizeof(struct dm_region));
218 if (!rh->region_pool) {
219 vfree(rh->buckets);
220 kfree(rh);
221 rh = ERR_PTR(-ENOMEM);
222 }
223
224 return rh;
225}
226EXPORT_SYMBOL_GPL(dm_region_hash_create);
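/*
 * Worked example of the bucket sizing above (illustrative only): with
 * nr_regions = 100000, max_buckets = 100000 >> 6 = 1562; the loop doubles
 * 128 -> 256 -> 512 -> 1024 -> 2048 and the final shift halves that to
 * nr_buckets = 1024, i.e. the largest power of two below nr_regions / 64,
 * with a floor of 64 for small devices.  A power-of-two bucket count is
 * what lets rh_hash() below reduce with "& rh->mask" instead of a modulo.
 */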
227
228void dm_region_hash_destroy(struct dm_region_hash *rh)
229{
230 unsigned h;
231 struct dm_region *reg, *nreg;
232
233 BUG_ON(!list_empty(&rh->quiesced_regions));
234 for (h = 0; h < rh->nr_buckets; h++) {
235 list_for_each_entry_safe(reg, nreg, rh->buckets + h,
236 hash_list) {
237 BUG_ON(atomic_read(&reg->pending));
238 mempool_free(reg, rh->region_pool);
239 }
240 }
241
242 if (rh->log)
243 dm_dirty_log_destroy(rh->log);
244
245 if (rh->region_pool)
246 mempool_destroy(rh->region_pool);
247
248 vfree(rh->buckets);
249 kfree(rh);
250}
251EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
252
253struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
254{
255 return rh->log;
256}
257EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
258
259static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
260{
261 return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
262}
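/*
 * rh_hash() is plain multiplicative hashing: scale the region number by a
 * large prime (RH_HASH_MULT), discard the low RH_HASH_SHIFT (12) bits of
 * the product, then mask down to the power-of-two bucket count
 * (rh->mask == nr_buckets - 1).  With 1024 buckets the mask is 0x3ff, so
 * bits 12..21 of region * RH_HASH_MULT select the bucket.
 */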
263
264static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
265{
266 struct dm_region *reg;
267 struct list_head *bucket = rh->buckets + rh_hash(rh, region);
268
269 list_for_each_entry(reg, bucket, hash_list)
270 if (reg->key == region)
271 return reg;
272
273 return NULL;
274}
275
276static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
277{
278 list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
279}
280
281static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
282{
283 struct dm_region *reg, *nreg;
284
285 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
286 if (unlikely(!nreg))
287 nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
288
289 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
290 DM_RH_CLEAN : DM_RH_NOSYNC;
291 nreg->rh = rh;
292 nreg->key = region;
293 INIT_LIST_HEAD(&nreg->list);
294 atomic_set(&nreg->pending, 0);
295 bio_list_init(&nreg->delayed_bios);
296
297 write_lock_irq(&rh->hash_lock);
298 reg = __rh_lookup(rh, region);
299 if (reg)
300 /* We lost the race. */
301 mempool_free(nreg, rh->region_pool);
302 else {
303 __rh_insert(rh, nreg);
304 if (nreg->state == DM_RH_CLEAN) {
305 spin_lock(&rh->region_lock);
306 list_add(&nreg->list, &rh->clean_regions);
307 spin_unlock(&rh->region_lock);
308 }
309
310 reg = nreg;
311 }
312 write_unlock_irq(&rh->hash_lock);
313
314 return reg;
315}
316
317static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
318{
319 struct dm_region *reg;
320
321 reg = __rh_lookup(rh, region);
322 if (!reg) {
323 read_unlock(&rh->hash_lock);
324 reg = __rh_alloc(rh, region);
325 read_lock(&rh->hash_lock);
326 }
327
328 return reg;
329}
330
331int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
332{
333 int r;
334 struct dm_region *reg;
335
336 read_lock(&rh->hash_lock);
337 reg = __rh_lookup(rh, region);
338 read_unlock(&rh->hash_lock);
339
340 if (reg)
341 return reg->state;
342
343 /*
344 * The region wasn't in the hash, so we fall back to the
345 * dirty log.
346 */
347 r = rh->log->type->in_sync(rh->log, region, may_block);
348
349 /*
350 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
351 * taken as a DM_RH_NOSYNC
352 */
353 return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
354}
355EXPORT_SYMBOL_GPL(dm_rh_get_state);
356
357static void complete_resync_work(struct dm_region *reg, int success)
358{
359 struct dm_region_hash *rh = reg->rh;
360
361 rh->log->type->set_region_sync(rh->log, reg->key, success);
362
363 /*
364 * Dispatch the bios before we call 'wake_up_all'.
365 * This is important because if we are suspending,
366 * we want to know that recovery is complete and
367 * the work queue is flushed. If we wake_up_all
368 * before we dispatch_bios (queue bios and wake the worker),
369 * then we risk suspending before the work queue
370 * has been properly flushed.
371 */
372 rh->dispatch_bios(rh->context, &reg->delayed_bios);
373 if (atomic_dec_and_test(&rh->recovery_in_flight))
374 rh->wakeup_all_recovery_waiters(rh->context);
375 up(&rh->recovery_count);
376}
377
378/* dm_rh_mark_nosync
379 * @rh
380 * @bio
381 * @done
382 * @error
383 *
384 * The bio was written on some mirror(s) but failed on other mirror(s).
385 * We can successfully endio the bio but should avoid the region being
386 * marked clean by setting the state DM_RH_NOSYNC.
387 *
388 * This function is _not_ safe in interrupt context!
389 */
390void dm_rh_mark_nosync(struct dm_region_hash *rh,
391 struct bio *bio, unsigned done, int error)
392{
393 unsigned long flags;
394 struct dm_dirty_log *log = rh->log;
395 struct dm_region *reg;
396 region_t region = dm_rh_bio_to_region(rh, bio);
397 int recovering = 0;
398
399 /* We must inform the log that the sync count has changed. */
400 log->type->set_region_sync(log, region, 0);
401
402 read_lock(&rh->hash_lock);
403 reg = __rh_find(rh, region);
404 read_unlock(&rh->hash_lock);
405
406 /* region hash entry should exist because write was in-flight */
407 BUG_ON(!reg);
408 BUG_ON(!list_empty(&reg->list));
409
410 spin_lock_irqsave(&rh->region_lock, flags);
411 /*
412 * Possible cases:
413 * 1) DM_RH_DIRTY
414 * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
415 * 3) DM_RH_RECOVERING: flushing pending writes
416 * In either case, the region should not have been connected to any list.
417 */
418 recovering = (reg->state == DM_RH_RECOVERING);
419 reg->state = DM_RH_NOSYNC;
420 BUG_ON(!list_empty(&reg->list));
421 spin_unlock_irqrestore(&rh->region_lock, flags);
422
423 bio_endio(bio, error);
424 if (recovering)
425 complete_resync_work(reg, 0);
426}
427EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
428
429void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
430{
431 struct dm_region *reg, *next;
432
433 LIST_HEAD(clean);
434 LIST_HEAD(recovered);
435 LIST_HEAD(failed_recovered);
436
437 /*
438 * Quickly grab the lists.
439 */
440 write_lock_irq(&rh->hash_lock);
441 spin_lock(&rh->region_lock);
442 if (!list_empty(&rh->clean_regions)) {
443 list_splice_init(&rh->clean_regions, &clean);
444
445 list_for_each_entry(reg, &clean, list)
446 list_del(&reg->hash_list);
447 }
448
449 if (!list_empty(&rh->recovered_regions)) {
450 list_splice_init(&rh->recovered_regions, &recovered);
451
452 list_for_each_entry(reg, &recovered, list)
453 list_del(&reg->hash_list);
454 }
455
456 if (!list_empty(&rh->failed_recovered_regions)) {
457 list_splice_init(&rh->failed_recovered_regions,
458 &failed_recovered);
459
460 list_for_each_entry(reg, &failed_recovered, list)
461 list_del(&reg->hash_list);
462 }
463
464 spin_unlock(&rh->region_lock);
465 write_unlock_irq(&rh->hash_lock);
466
467 /*
468 * All the regions on the recovered and clean lists have
469 * now been pulled out of the system, so no need to do
470 * any more locking.
471 */
472 list_for_each_entry_safe(reg, next, &recovered, list) {
473 rh->log->type->clear_region(rh->log, reg->key);
474 complete_resync_work(reg, 1);
475 mempool_free(reg, rh->region_pool);
476 }
477
478 list_for_each_entry_safe(reg, next, &failed_recovered, list) {
479 complete_resync_work(reg, errors_handled ? 0 : 1);
480 mempool_free(reg, rh->region_pool);
481 }
482
483 list_for_each_entry_safe(reg, next, &clean, list) {
484 rh->log->type->clear_region(rh->log, reg->key);
485 mempool_free(reg, rh->region_pool);
486 }
487
488 rh->log->type->flush(rh->log);
489}
490EXPORT_SYMBOL_GPL(dm_rh_update_states);
491
492static void rh_inc(struct dm_region_hash *rh, region_t region)
493{
494 struct dm_region *reg;
495
496 read_lock(&rh->hash_lock);
497 reg = __rh_find(rh, region);
498
499 spin_lock_irq(&rh->region_lock);
500 atomic_inc(&reg->pending);
501
502 if (reg->state == DM_RH_CLEAN) {
503 reg->state = DM_RH_DIRTY;
504 list_del_init(&reg->list); /* take off the clean list */
505 spin_unlock_irq(&rh->region_lock);
506
507 rh->log->type->mark_region(rh->log, reg->key);
508 } else
509 spin_unlock_irq(&rh->region_lock);
510
511
512 read_unlock(&rh->hash_lock);
513}
514
515void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
516{
517 struct bio *bio;
518
519 for (bio = bios->head; bio; bio = bio->bi_next)
520 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
521}
522EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
523
524void dm_rh_dec(struct dm_region_hash *rh, region_t region)
525{
526 unsigned long flags;
527 struct dm_region *reg;
528 int should_wake = 0;
529
530 read_lock(&rh->hash_lock);
531 reg = __rh_lookup(rh, region);
532 read_unlock(&rh->hash_lock);
533
534 spin_lock_irqsave(&rh->region_lock, flags);
535 if (atomic_dec_and_test(&reg->pending)) {
536 /*
537 * There is no pending I/O for this region.
538 * We can move the region to the corresponding list for the next action.
539 * At this point, the region is not yet connected to any list.
540 *
541 * If the state is DM_RH_NOSYNC, the region should be kept off
542 * the clean list.
543 * The hash entry for DM_RH_NOSYNC will remain in memory
544 * until the region is recovered or the map is reloaded.
545 */
546
547 /* do nothing for DM_RH_NOSYNC */
548 if (reg->state == DM_RH_RECOVERING) {
549 list_add_tail(&reg->list, &rh->quiesced_regions);
550 } else if (reg->state == DM_RH_DIRTY) {
551 reg->state = DM_RH_CLEAN;
552 list_add(&reg->list, &rh->clean_regions);
553 }
554 should_wake = 1;
555 }
556 spin_unlock_irqrestore(&rh->region_lock, flags);
557
558 if (should_wake)
559 rh->wakeup_workers(rh->context);
560}
561EXPORT_SYMBOL_GPL(dm_rh_dec);
562
563/*
564 * Starts quiescing a region in preparation for recovery.
565 */
566static int __rh_recovery_prepare(struct dm_region_hash *rh)
567{
568 int r;
569 region_t region;
570 struct dm_region *reg;
571
572 /*
573 * Ask the dirty log what's next.
574 */
575 r = rh->log->type->get_resync_work(rh->log, &region);
576 if (r <= 0)
577 return r;
578
579 /*
580 * Get this region, and start it quiescing by setting the
581 * recovering flag.
582 */
583 read_lock(&rh->hash_lock);
584 reg = __rh_find(rh, region);
585 read_unlock(&rh->hash_lock);
586
587 spin_lock_irq(&rh->region_lock);
588 reg->state = DM_RH_RECOVERING;
589
590 /* Already quiesced ? */
591 if (atomic_read(&reg->pending))
592 list_del_init(&reg->list);
593 else
594 list_move(&reg->list, &rh->quiesced_regions);
595
596 spin_unlock_irq(&rh->region_lock);
597
598 return 1;
599}
600
601void dm_rh_recovery_prepare(struct dm_region_hash *rh)
602{
603 /* Extra reference to avoid race with dm_rh_stop_recovery */
604 atomic_inc(&rh->recovery_in_flight);
605
606 while (!down_trylock(&rh->recovery_count)) {
607 atomic_inc(&rh->recovery_in_flight);
608 if (__rh_recovery_prepare(rh) <= 0) {
609 atomic_dec(&rh->recovery_in_flight);
610 up(&rh->recovery_count);
611 break;
612 }
613 }
614
615 /* Drop the extra reference */
616 if (atomic_dec_and_test(&rh->recovery_in_flight))
617 rh->wakeup_all_recovery_waiters(rh->context);
618}
619EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
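/*
 * The extra reference taken above keeps recovery_in_flight from transiently
 * dropping to zero while the loop is still handing out regions: each
 * successful __rh_recovery_prepare() leaves one in-flight count behind that
 * complete_resync_work() drops once the region is recovered, and the outer
 * inc/dec pair ensures wakeup_all_recovery_waiters() (e.g. a presuspend
 * waiting on dm_rh_recovery_in_flight()) only fires after the loop is done.
 */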
620
621/*
622 * Returns any quiesced regions.
623 */
624struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
625{
626 struct dm_region *reg = NULL;
627
628 spin_lock_irq(&rh->region_lock);
629 if (!list_empty(&rh->quiesced_regions)) {
630 reg = list_entry(rh->quiesced_regions.next,
631 struct dm_region, list);
632 list_del_init(&reg->list); /* remove from the quiesced list */
633 }
634 spin_unlock_irq(&rh->region_lock);
635
636 return reg;
637}
638EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
639
640void dm_rh_recovery_end(struct dm_region *reg, int success)
641{
642 struct dm_region_hash *rh = reg->rh;
643
644 spin_lock_irq(&rh->region_lock);
645 if (success)
646 list_add(&reg->list, &reg->rh->recovered_regions);
647 else {
648 reg->state = DM_RH_NOSYNC;
649 list_add(&reg->list, &reg->rh->failed_recovered_regions);
650 }
651 spin_unlock_irq(&rh->region_lock);
652
653 rh->wakeup_workers(rh->context);
654}
655EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
656
657/* Return recovery in flight count. */
658int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
659{
660 return atomic_read(&rh->recovery_in_flight);
661}
662EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
663
664int dm_rh_flush(struct dm_region_hash *rh)
665{
666 return rh->log->type->flush(rh->log);
667}
668EXPORT_SYMBOL_GPL(dm_rh_flush);
669
670void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
671{
672 struct dm_region *reg;
673
674 read_lock(&rh->hash_lock);
675 reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
676 bio_list_add(&reg->delayed_bios, bio);
677 read_unlock(&rh->hash_lock);
678}
679EXPORT_SYMBOL_GPL(dm_rh_delay);
680
681void dm_rh_stop_recovery(struct dm_region_hash *rh)
682{
683 int i;
684
685 /* wait for any recovering regions */
686 for (i = 0; i < rh->max_recovery; i++)
687 down(&rh->recovery_count);
688}
689EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
690
691void dm_rh_start_recovery(struct dm_region_hash *rh)
692{
693 int i;
694
695 for (i = 0; i < rh->max_recovery; i++)
696 up(&rh->recovery_count);
697
698 rh->wakeup_workers(rh->context);
699}
700EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
701
702MODULE_DESCRIPTION(DM_NAME " region hash");
703MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
704MODULE_LICENSE("GPL");
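
Taken together, the exported calls above give a mirror-style target a small lifecycle: create the hash with three callbacks, mark regions dirty before issuing writes, drop them on completion, and let the worker fold state changes back into the dirty log. A rough sketch of that flow follows; my_mirror, issue_writes() and the callback names are placeholders, not part of the patch:

static void my_dispatch_bios(void *context, struct bio_list *bios);
static void my_wakeup_worker(void *context);
static void my_wakeup_recovery_waiters(void *context);

static int my_mirror_setup(struct my_mirror *mm, struct dm_dirty_log *log,
			   sector_t ti_begin, uint32_t region_size,
			   region_t nr_regions)
{
	mm->rh = dm_region_hash_create(mm, my_dispatch_bios, my_wakeup_worker,
				       my_wakeup_recovery_waiters, ti_begin,
				       MAX_RECOVERY, log, region_size,
				       nr_regions);
	return IS_ERR(mm->rh) ? PTR_ERR(mm->rh) : 0;
}

/* Worker write path: dirty the regions, persist the log, then issue. */
static void my_do_writes(struct my_mirror *mm, struct bio_list *writes)
{
	struct bio *bio;

	dm_rh_inc_pending(mm->rh, writes);
	dm_rh_flush(mm->rh);

	while ((bio = bio_list_pop(writes)))
		issue_writes(mm, bio);		/* placeholder */
}

/*
 * Write completion: the region is saved at map time (bi_sector may have
 * advanced by now), and that saved value is handed back to dm_rh_dec().
 * When the last pending write drops, the region moves to the clean or
 * quiesced list and the worker is woken to run dm_rh_update_states().
 */
static void my_write_end_io(struct my_mirror *mm, region_t saved_region)
{
	dm_rh_dec(mm->rh, saved_region);
}
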
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 391dfa2ad434..cdfbf65b28cb 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -9,7 +9,8 @@
9 * Round-robin path selector. 9 * Round-robin path selector.
10 */ 10 */
11 11
12#include "dm.h" 12#include <linux/device-mapper.h>
13
13#include "dm-path-selector.h" 14#include "dm-path-selector.h"
14 15
15#include <linux/slab.h> 16#include <linux/slab.h>
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6e5528aecc98..b2d9d1ac28ad 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -600,7 +600,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
600 600
601 s->valid = 1; 601 s->valid = 1;
602 s->active = 0; 602 s->active = 0;
603 s->last_percent = 0;
604 init_rwsem(&s->lock); 603 init_rwsem(&s->lock);
605 spin_lock_init(&s->pe_lock); 604 spin_lock_init(&s->pe_lock);
606 s->ti = ti; 605 s->ti = ti;
@@ -824,8 +823,10 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
824 * the bios for the original write to the origin. 823 * the bios for the original write to the origin.
825 */ 824 */
826 if (primary_pe && 825 if (primary_pe &&
827 atomic_dec_and_test(&primary_pe->ref_count)) 826 atomic_dec_and_test(&primary_pe->ref_count)) {
828 origin_bios = bio_list_get(&primary_pe->origin_bios); 827 origin_bios = bio_list_get(&primary_pe->origin_bios);
828 free_pending_exception(primary_pe);
829 }
829 830
830 /* 831 /*
831 * Free the pe if it's not linked to an origin write or if 832 * Free the pe if it's not linked to an origin write or if
@@ -834,12 +835,6 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
834 if (!primary_pe || primary_pe != pe) 835 if (!primary_pe || primary_pe != pe)
835 free_pending_exception(pe); 836 free_pending_exception(pe);
836 837
837 /*
838 * Free the primary pe if nothing references it.
839 */
840 if (primary_pe && !atomic_read(&primary_pe->ref_count))
841 free_pending_exception(primary_pe);
842
843 return origin_bios; 838 return origin_bios;
844} 839}
845 840
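
The put_pending_exception() hunks above fold the final free of primary_pe into the same atomic_dec_and_test() that takes the last reference, instead of re-checking the count afterwards. The general shape of that refcount rule, with a hypothetical struct obj standing in for the pending exception:

/* Only the caller whose decrement hits zero may free.  A separate
 * "if (!atomic_read(&o->ref))" check afterwards would let two droppers
 * both observe zero and free the object twice. */
static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->ref))
		kfree(o);
}
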
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 292c15609ae3..f07315fe2362 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -9,7 +9,7 @@
9#ifndef DM_SNAPSHOT_H 9#ifndef DM_SNAPSHOT_H
10#define DM_SNAPSHOT_H 10#define DM_SNAPSHOT_H
11 11
12#include "dm.h" 12#include <linux/device-mapper.h>
13#include "dm-bio-list.h" 13#include "dm-bio-list.h"
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
@@ -158,9 +158,6 @@ struct dm_snapshot {
158 /* Used for display of table */ 158 /* Used for display of table */
159 char type; 159 char type;
160 160
161 /* The last percentage we notified */
162 int last_percent;
163
164 mempool_t *pending_pool; 161 mempool_t *pending_pool;
165 162
166 struct exception_table pending; 163 struct exception_table pending;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b745d8ac625b..a2d068dbe9e2 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -4,7 +4,7 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm.h" 7#include <linux/device-mapper.h>
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/init.h> 10#include <linux/init.h>
@@ -60,8 +60,8 @@ static inline struct stripe_c *alloc_context(unsigned int stripes)
60{ 60{
61 size_t len; 61 size_t len;
62 62
63 if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), 63 if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
64 stripes)) 64 stripes))
65 return NULL; 65 return NULL;
66 66
67 len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); 67 len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index bdec206c404b..cdbf126ec106 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -4,7 +4,7 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm.h" 7#include <linux/device-mapper.h>
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/init.h> 10#include <linux/init.h>
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 327de03a5bdf..d1d0cd0f5750 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -76,7 +76,6 @@ union map_info *dm_get_mapinfo(struct bio *bio)
76 */ 76 */
77struct dm_wq_req { 77struct dm_wq_req {
78 enum { 78 enum {
79 DM_WQ_FLUSH_ALL,
80 DM_WQ_FLUSH_DEFERRED, 79 DM_WQ_FLUSH_DEFERRED,
81 } type; 80 } type;
82 struct work_struct work; 81 struct work_struct work;
@@ -151,40 +150,40 @@ static struct kmem_cache *_tio_cache;
151 150
152static int __init local_init(void) 151static int __init local_init(void)
153{ 152{
154 int r; 153 int r = -ENOMEM;
155 154
156 /* allocate a slab for the dm_ios */ 155 /* allocate a slab for the dm_ios */
157 _io_cache = KMEM_CACHE(dm_io, 0); 156 _io_cache = KMEM_CACHE(dm_io, 0);
158 if (!_io_cache) 157 if (!_io_cache)
159 return -ENOMEM; 158 return r;
160 159
161 /* allocate a slab for the target ios */ 160 /* allocate a slab for the target ios */
162 _tio_cache = KMEM_CACHE(dm_target_io, 0); 161 _tio_cache = KMEM_CACHE(dm_target_io, 0);
163 if (!_tio_cache) { 162 if (!_tio_cache)
164 kmem_cache_destroy(_io_cache); 163 goto out_free_io_cache;
165 return -ENOMEM;
166 }
167 164
168 r = dm_uevent_init(); 165 r = dm_uevent_init();
169 if (r) { 166 if (r)
170 kmem_cache_destroy(_tio_cache); 167 goto out_free_tio_cache;
171 kmem_cache_destroy(_io_cache);
172 return r;
173 }
174 168
175 _major = major; 169 _major = major;
176 r = register_blkdev(_major, _name); 170 r = register_blkdev(_major, _name);
177 if (r < 0) { 171 if (r < 0)
178 kmem_cache_destroy(_tio_cache); 172 goto out_uevent_exit;
179 kmem_cache_destroy(_io_cache);
180 dm_uevent_exit();
181 return r;
182 }
183 173
184 if (!_major) 174 if (!_major)
185 _major = r; 175 _major = r;
186 176
187 return 0; 177 return 0;
178
179out_uevent_exit:
180 dm_uevent_exit();
181out_free_tio_cache:
182 kmem_cache_destroy(_tio_cache);
183out_free_io_cache:
184 kmem_cache_destroy(_io_cache);
185
186 return r;
188} 187}
189 188
190static void local_exit(void) 189static void local_exit(void)
@@ -669,6 +668,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
669 clone->bi_size = to_bytes(len); 668 clone->bi_size = to_bytes(len);
670 clone->bi_io_vec->bv_offset = offset; 669 clone->bi_io_vec->bv_offset = offset;
671 clone->bi_io_vec->bv_len = clone->bi_size; 670 clone->bi_io_vec->bv_len = clone->bi_size;
671 clone->bi_flags |= 1 << BIO_CLONED;
672 672
673 return clone; 673 return clone;
674} 674}
@@ -1394,9 +1394,6 @@ static void dm_wq_work(struct work_struct *work)
1394 1394
1395 down_write(&md->io_lock); 1395 down_write(&md->io_lock);
1396 switch (req->type) { 1396 switch (req->type) {
1397 case DM_WQ_FLUSH_ALL:
1398 __merge_pushback_list(md);
1399 /* pass through */
1400 case DM_WQ_FLUSH_DEFERRED: 1397 case DM_WQ_FLUSH_DEFERRED:
1401 __flush_deferred_io(md); 1398 __flush_deferred_io(md);
1402 break; 1399 break;
@@ -1526,7 +1523,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1526 if (!md->suspended_bdev) { 1523 if (!md->suspended_bdev) {
1527 DMWARN("bdget failed in dm_suspend"); 1524 DMWARN("bdget failed in dm_suspend");
1528 r = -ENOMEM; 1525 r = -ENOMEM;
1529 goto flush_and_out; 1526 goto out;
1530 } 1527 }
1531 1528
1532 /* 1529 /*
@@ -1577,14 +1574,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1577 1574
1578 set_bit(DMF_SUSPENDED, &md->flags); 1575 set_bit(DMF_SUSPENDED, &md->flags);
1579 1576
1580flush_and_out:
1581 if (r && noflush)
1582 /*
1583 * Because there may be already I/Os in the pushback list,
1584 * flush them before return.
1585 */
1586 dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
1587
1588out: 1577out:
1589 if (r && md->suspended_bdev) { 1578 if (r && md->suspended_bdev) {
1590 bdput(md->suspended_bdev); 1579 bdput(md->suspended_bdev);
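
The local_init() hunks earlier in the dm.c diff swap per-branch cleanup for a single unwind ladder: every failure jumps to the label that releases exactly what has been set up so far. A stripped-down version of the pattern, where alloc_a(), alloc_b(), register_thing() and their free counterparts are placeholders:

static void *a, *b;			/* placeholder resources */

static int __init my_init(void)
{
	int r = -ENOMEM;

	a = alloc_a();
	if (!a)
		return r;		/* nothing to unwind yet */

	b = alloc_b();
	if (!b)
		goto out_free_a;

	r = register_thing();		/* may fail with -errno */
	if (r < 0)
		goto out_free_b;

	return 0;

out_free_b:
	free_b(b);
out_free_a:
	free_a(a);
	return r;
}
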
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index cd189da2b2fa..0ade60cdef42 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -62,15 +62,6 @@ void dm_put_target_type(struct target_type *t);
62int dm_target_iterate(void (*iter_func)(struct target_type *tt, 62int dm_target_iterate(void (*iter_func)(struct target_type *tt,
63 void *param), void *param); 63 void *param), void *param);
64 64
65/*-----------------------------------------------------------------
66 * Useful inlines.
67 *---------------------------------------------------------------*/
68static inline int array_too_big(unsigned long fixed, unsigned long obj,
69 unsigned long num)
70{
71 return (num > (ULONG_MAX - fixed) / obj);
72}
73
74int dm_split_args(int *argc, char ***argvp, char *input); 65int dm_split_args(int *argc, char ***argvp, char *input);
75 66
76/* 67/*