Diffstat (limited to 'drivers/md/dm-raid1.c')
 -rw-r--r--  drivers/md/dm-raid1.c  232
 1 file changed, 166 insertions(+), 66 deletions(-)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ddda531723dc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
  *---------------------------------------------------------------*/
 enum dm_raid1_error {
 	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_FLUSH_ERROR,
 	DM_RAID1_SYNC_ERROR,
 	DM_RAID1_READ_ERROR
 };
@@ -57,6 +58,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
 	atomic_set(&ms->default_mirror, m - m0);
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
+	ms->leg_failure = 1;
+
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			set_default_mirror(new);
-			break;
-		}
-
-	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+	new = get_valid_mirror(ms);
+	if (new)
+		set_default_mirror(new);
+	else
 		DMWARN("All sides of mirror have failed.");
 
 out:
 	schedule_work(&ms->trigger_event);
 }
 
+static int mirror_flush(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	unsigned long error_bits;
+
+	unsigned int i;
+	struct dm_io_region io[ms->nr_mirrors];
+	struct mirror *m;
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = ms->io_client,
+	};
+
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = 0;
+		io[i].count = 0;
+	}
+
+	error_bits = -1;
+	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+	if (unlikely(error_bits != 0)) {
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (test_bit(i, &error_bits))
+				fail_mirror(ms->mirror + i,
+					    DM_RAID1_FLUSH_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
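Note on the new mirror_flush() above: it issues an empty WRITE_BARRIER through dm_io() to every mirror leg (each dm_io_region has sector 0 and count 0, so no data is written), then walks the returned error bitmask and fails every leg whose bit is set with DM_RAID1_FLUSH_ERROR. A minimal userspace analogue of that per-leg bit decode is sketched below; fail_leg() and NR_LEGS are hypothetical stand-ins, not kernel symbols.

/* Sketch only: decode a per-leg error bitmask the way mirror_flush() does. */
#include <stdio.h>

#define NR_LEGS 3

static void fail_leg(unsigned int i)
{
	/* In the kernel this would be fail_mirror(..., DM_RAID1_FLUSH_ERROR). */
	printf("leg %u failed the flush\n", i);
}

int main(void)
{
	/* dm_io() reports one error bit per region; assume legs 0 and 2 failed. */
	unsigned long error_bits = (1UL << 0) | (1UL << 2);
	unsigned int i;

	for (i = 0; i < NR_LEGS; i++)
		if (error_bits & (1UL << i))
			fail_leg(i);

	return 0;
}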
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
+	if (unlikely(!bio->bi_size))
+		return 0;
 	return m->offset + (bio->bi_sector - m->ms->ti->begin);
 }
 
@@ -413,6 +462,34 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * Lock is required to avoid race condition during suspend
+	 * process.
+	 */
+	spin_lock_irq(&ms->lock);
+
+	if (atomic_read(&ms->suspend)) {
+		spin_unlock_irq(&ms->lock);
+
+		/*
+		 * If device is suspended, complete the bio.
+		 */
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
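hold_bio() parks writes that cannot be issued safely until the device is next suspended. Its three outcomes, restated as a quick reference derived from the function above:

/*
 * hold_bio() decision, per bio:
 *   suspended + noflush suspend  -> bio_endio(bio, DM_ENDIO_REQUEUE)
 *   suspended otherwise          -> bio_endio(bio, -EIO)
 *   not suspended                -> park the bio on ms->holds
 */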
@@ -511,7 +588,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -524,36 +600,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event.  Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		if (!ms->failures.head)
-			should_wake = 1;
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		if (should_wake)
-			wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * Need to raise event.  Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -603,6 +670,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
+		if (unlikely(bio_empty_barrier(bio))) {
+			bio_list_add(&sync, bio);
+			continue;
+		}
+
 		region = dm_rh_bio_to_region(ms->rh, bio);
 
 		if (log->type->is_remote_recovering &&
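The bio_empty_barrier() test added above catches barrier requests that carry no data; such bios map to no region, so they are routed straight to the sync list and replicated to every leg by do_write(), which now also propagates WRITE_BARRIER. For orientation only, in kernels of roughly this vintage the predicate looked approximately like the following; the exact definition varies between versions, so treat this as an assumption rather than a quotation of include/linux/bio.h.

/* Approximate 2.6.32-era form of bio_empty_barrier(), shown for reference. */
#define bio_empty_barrier(bio)					\
	(bio_rw_flagged(bio, BIO_RW_BARRIER) &&			\
	 !bio_has_data(bio) &&					\
	 !bio_rw_flagged(bio, BIO_RW_DISCARD))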
@@ -659,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	/*
 	 * Dispatch io.
 	 */
-	if (unlikely(ms->log_failure)) {
+	if (unlikely(ms->log_failure) && errors_handled(ms)) {
 		spin_lock_irq(&ms->lock);
 		bio_list_merge(&ms->failures, &sync);
 		spin_unlock_irq(&ms->lock);
@@ -672,8 +744,15 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(get_default_mirror(ms), bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+			spin_lock_irq(&ms->lock);
+			bio_list_add(&ms->failures, bio);
+			spin_unlock_irq(&ms->lock);
+			wakeup_mirrord(ms);
+		} else {
+			map_bio(get_default_mirror(ms), bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -681,20 +760,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
-		return;
-
-	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures))) {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
-		}
+	if (likely(!failures->head))
 		return;
-	}
 
 	/*
 	 * If the log has failed, unattempted writes are being
-	 * put on the failures list.  We can't issue those writes
+	 * put on the holds list.  We can't issue those writes
 	 * until a log has been marked, so we must store them.
 	 *
 	 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +780,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-	if (dm_noflush_suspending(ms->ti)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		return;
-	}
+	while ((bio = bio_list_pop(failures))) {
+		if (!ms->log_failure) {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio);
+		}
 
-	if (atomic_read(&ms->suspend)) {
-		while ((bio = bio_list_pop(failures)))
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
+		 */
+		if (!get_valid_mirror(ms))
 			bio_endio(bio, -EIO);
-		return;
+		else if (errors_handled(ms))
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, 0);
 	}
-
-	spin_lock_irq(&ms->lock);
-	bio_list_merge(&ms->failures, failures);
-	spin_unlock_irq(&ms->lock);
-
-	delayed_wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
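The rewritten loop above makes the per-bio outcome explicit; restating it as a quick reference derived from the code in this hunk:

/*
 * do_failures() per-bio outcome:
 *   no valid leg left       -> bio_endio(bio, -EIO)   (fail the write)
 *   errors_handled(ms) set  -> hold_bio(ms, bio)      (wait for userspace)
 *   otherwise               -> bio_endio(bio, 0)      (treat as success)
 */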
@@ -784,12 +859,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	}
 
 	spin_lock_init(&ms->lock);
+	bio_list_init(&ms->reads);
+	bio_list_init(&ms->writes);
+	bio_list_init(&ms->failures);
+	bio_list_init(&ms->holds);
 
 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
@@ -847,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		return -EINVAL;
 	}
 
-	if (dm_get_device(ti, argv[0], offset, ti->len,
-			  dm_table_get_mode(ti->table),
+	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
 			  &ms->mirror[mirror].dev)) {
 		ti->error = "Device lookup failure";
 		return -ENXIO;
@@ -889,7 +968,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
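The extra mirror_flush argument above corresponds to the flush callback that the dirty-log core gained in the same series: before committing region state, the log can call back into the target to flush all legs. The prototype, as best understood for the kernel this diff targets (the exact form is an assumption and may differ slightly between versions), is roughly:

/* Approximate prototype from include/linux/dm-dirty-log.h of the same era;
 * the third parameter is the flush callback that dm-raid1 now passes
 * mirror_flush for. */
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
			struct dm_target *ti,
			int (*flush_callback_fn)(struct dm_target *ti),
			unsigned argc, char **argv);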
@@ -995,6 +1075,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
+	ti->num_flush_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1122,7 +1203,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		dm_rh_dec(ms->rh, map_context->ll);
+		if (likely(!bio_empty_barrier(bio)))
+			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
@@ -1180,9 +1262,26 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
+	 * Process bios in the hold list to start recovery waiting
+	 * for bios in the hold list. After the process, no bio has
+	 * a chance to be added in the hold list because ms->suspend
+	 * is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
+
+	/*
 	 * We must finish up all the work that we've
 	 * generated (i.e. recovery work).
 	 */
@@ -1244,7 +1343,8 @@ static char device_status_char(struct mirror *m)
 	if (!atomic_read(&(m->error_count)))
 		return 'A';
 
-	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+	return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+		(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
 		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
 		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
 }
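With the new DM_RAID1_FLUSH_ERROR type, device_status_char() can report a distinct character for flush failures in the mirror target's status output. Summarising the mapping implemented in the hunk above (descriptive comment only, derived from that code):

/*
 * Per-leg status characters after this change:
 *   'A' - alive, no error recorded
 *   'F' - flush (barrier) failure
 *   'D' - write failure
 *   'S' - resynchronisation failure
 *   'R' - read failure
 *   'U' - error of an unrecognised type
 */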