Diffstat (limited to 'drivers/md/dm-raid1.c')
-rw-r--r--	drivers/md/dm-raid1.c	219
1 files changed, 156 insertions, 63 deletions
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ad779bd13aec 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
  *---------------------------------------------------------------*/
 enum dm_raid1_error {
 	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_FLUSH_ERROR,
 	DM_RAID1_SYNC_ERROR,
 	DM_RAID1_READ_ERROR
 };
@@ -57,6 +58,7 @@ struct mirror_set {
 	struct bio_list reads;
 	struct bio_list writes;
 	struct bio_list failures;
+	struct bio_list holds;	/* bios are waiting until suspend */
 
 	struct dm_region_hash *rh;
 	struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
 	region_t nr_regions;
 	int in_sync;
 	int log_failure;
+	int leg_failure;
 	atomic_t suspend;
 
 	atomic_t default_mirror;	/* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
 	atomic_set(&ms->default_mirror, m - m0);
 }
 
+static struct mirror *get_valid_mirror(struct mirror_set *ms)
+{
+	struct mirror *m;
+
+	for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
+		if (!atomic_read(&m->error_count))
+			return m;
+
+	return NULL;
+}
+
 /* fail_mirror
  * @m: mirror device to fail
  * @error_type: one of the enum's, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
+	ms->leg_failure = 1;
+
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 		goto out;
 	}
 
-	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
-		if (!atomic_read(&new->error_count)) {
-			set_default_mirror(new);
-			break;
-		}
-
-	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+	new = get_valid_mirror(ms);
+	if (new)
+		set_default_mirror(new);
+	else
 		DMWARN("All sides of mirror have failed.");
 
 out:
 	schedule_work(&ms->trigger_event);
 }
 
+static int mirror_flush(struct dm_target *ti)
+{
+	struct mirror_set *ms = ti->private;
+	unsigned long error_bits;
+
+	unsigned int i;
+	struct dm_io_region io[ms->nr_mirrors];
+	struct mirror *m;
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = ms->io_client,
+	};
+
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+		io[i].bdev = m->dev->bdev;
+		io[i].sector = 0;
+		io[i].count = 0;
+	}
+
+	error_bits = -1;
+	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
+	if (unlikely(error_bits != 0)) {
+		for (i = 0; i < ms->nr_mirrors; i++)
+			if (test_bit(i, &error_bits))
+				fail_mirror(ms->mirror + i,
+					    DM_RAID1_FLUSH_ERROR);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
 *
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
  */
 static sector_t map_sector(struct mirror *m, struct bio *bio)
 {
+	if (unlikely(!bio->bi_size))
+		return 0;
 	return m->offset + (bio->bi_sector - m->ms->ti->begin);
 }
 
@@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 	io->count = bio->bi_size >> 9;
 }
 
+static void hold_bio(struct mirror_set *ms, struct bio *bio)
+{
+	/*
+	 * If device is suspended, complete the bio.
+	 */
+	if (atomic_read(&ms->suspend)) {
+		if (dm_noflush_suspending(ms->ti))
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_endio(bio, -EIO);
+		return;
+	}
+
+	/*
+	 * Hold bio until the suspend is complete.
+	 */
+	spin_lock_irq(&ms->lock);
+	bio_list_add(&ms->holds, bio);
+	spin_unlock_irq(&ms->lock);
+}
+
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
@@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
 	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-	int uptodate = 0;
 	int should_wake = 0;
 	unsigned long flags;
 
@@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
-	if (likely(!error))
-		goto out;
+	if (likely(!error)) {
+		bio_endio(bio, ret);
+		return;
+	}
 
 	for (i = 0; i < ms->nr_mirrors; i++)
 		if (test_bit(i, &error))
 			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
-		else
-			uptodate = 1;
 
-	if (unlikely(!uptodate)) {
-		DMERR("All replicated volumes dead, failing I/O");
-		/* None of the writes succeeded, fail the I/O. */
-		ret = -EIO;
-	} else if (errors_handled(ms)) {
-		/*
-		 * Need to raise event. Since raising
-		 * events can block, we need to do it in
-		 * the main thread.
-		 */
-		spin_lock_irqsave(&ms->lock, flags);
-		if (!ms->failures.head)
-			should_wake = 1;
-		bio_list_add(&ms->failures, bio);
-		spin_unlock_irqrestore(&ms->lock, flags);
-		if (should_wake)
-			wakeup_mirrord(ms);
-		return;
-	}
-out:
-	bio_endio(bio, ret);
+	/*
+	 * Need to raise event. Since raising
+	 * events can block, we need to do it in
+	 * the main thread.
+	 */
+	spin_lock_irqsave(&ms->lock, flags);
+	if (!ms->failures.head)
+		should_wake = 1;
+	bio_list_add(&ms->failures, bio);
+	spin_unlock_irqrestore(&ms->lock, flags);
+	if (should_wake)
+		wakeup_mirrord(ms);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
+		if (unlikely(bio_empty_barrier(bio))) {
+			bio_list_add(&sync, bio);
+			continue;
+		}
+
 		region = dm_rh_bio_to_region(ms->rh, bio);
 
 		if (log->type->is_remote_recovering &&
@@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(get_default_mirror(ms), bio);
-		generic_make_request(bio);
+		if (unlikely(ms->leg_failure) && errors_handled(ms))
+			hold_bio(ms, bio);
+		else {
+			map_bio(get_default_mirror(ms), bio);
+			generic_make_request(bio);
+		}
 	}
 }
 
@@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 {
 	struct bio *bio;
 
-	if (!failures->head)
-		return;
-
-	if (!ms->log_failure) {
-		while ((bio = bio_list_pop(failures))) {
-			ms->in_sync = 0;
-			dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
-		}
+	if (likely(!failures->head))
 		return;
-	}
 
 	/*
 	 * If the log has failed, unattempted writes are being
-	 * put on the failures list. We can't issue those writes
+	 * put on the holds list. We can't issue those writes
 	 * until a log has been marked, so we must store them.
 	 *
 	 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 	 * for us to treat them the same and requeue them
 	 * as well.
 	 */
-	if (dm_noflush_suspending(ms->ti)) {
-		while ((bio = bio_list_pop(failures)))
-			bio_endio(bio, DM_ENDIO_REQUEUE);
-		return;
-	}
+	while ((bio = bio_list_pop(failures))) {
+		if (!ms->log_failure) {
+			ms->in_sync = 0;
+			dm_rh_mark_nosync(ms->rh, bio);
+		}
 
-	if (atomic_read(&ms->suspend)) {
-		while ((bio = bio_list_pop(failures)))
+		/*
+		 * If all the legs are dead, fail the I/O.
+		 * If we have been told to handle errors, hold the bio
+		 * and wait for userspace to deal with the problem.
+		 * Otherwise pretend that the I/O succeeded. (This would
+		 * be wrong if the failed leg returned after reboot and
+		 * got replicated back to the good legs.)
+		 */
+		if (!get_valid_mirror(ms))
 			bio_endio(bio, -EIO);
-		return;
+		else if (errors_handled(ms))
+			hold_bio(ms, bio);
+		else
+			bio_endio(bio, 0);
 	}
-
-	spin_lock_irq(&ms->lock);
-	bio_list_merge(&ms->failures, failures);
-	spin_unlock_irq(&ms->lock);
-
-	delayed_wake(ms);
 }
 
 static void trigger_event(struct work_struct *work)
@@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	}
 
 	spin_lock_init(&ms->lock);
+	bio_list_init(&ms->reads);
+	bio_list_init(&ms->writes);
+	bio_list_init(&ms->failures);
+	bio_list_init(&ms->holds);
 
 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
 	ms->log_failure = 0;
+	ms->leg_failure = 0;
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
@@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
+	dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
+				 argv + 2);
 	if (!dl) {
 		ti->error = "Error creating mirror dirty log";
 		return NULL;
@@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
+	ti->num_flush_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
@@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		dm_rh_dec(ms->rh, map_context->ll);
+		if (likely(!bio_empty_barrier(bio)))
+			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
 
@@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 
+	struct bio_list holds;
+	struct bio *bio;
+
 	atomic_set(&ms->suspend, 1);
 
 	/*
@@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
 	 * we know that all of our I/O has been pushed.
 	 */
 	flush_workqueue(ms->kmirrord_wq);
+
+	/*
+	 * Now set ms->suspend is set and the workqueue flushed, no more
+	 * entries can be added to ms->hold list, so process it.
+	 *
+	 * Bios can still arrive concurrently with or after this
+	 * presuspend function, but they cannot join the hold list
+	 * because ms->suspend is set.
+	 */
+	spin_lock_irq(&ms->lock);
+	holds = ms->holds;
+	bio_list_init(&ms->holds);
+	spin_unlock_irq(&ms->lock);
+
+	while ((bio = bio_list_pop(&holds)))
+		hold_bio(ms, bio);
 }
 
 static void mirror_postsuspend(struct dm_target *ti)
@@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
 	if (!atomic_read(&(m->error_count)))
 		return 'A';
 
-	return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
+	return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
+		(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
 		(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
 		(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
 }
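
For readers following the hold/release logic this patch introduces (hold_bio(), the new ms->holds list, and the drain added to mirror_presuspend()), the stand-alone C sketch below models that flow outside the kernel. It is not kernel code and not part of the patch: struct held_io stands in for struct bio, a status field stands in for bio_endio(), and every identifier here (mirror_model, hold_io, presuspend) is invented purely for illustration, under the simplified semantics visible in the hunks above.

/*
 * Toy model of the dm-raid1 "holds" mechanism: park I/O while a leg
 * failure is being handled, then complete it at suspend time.
 * All names are illustrative; this is not the kernel implementation.
 */
#include <stdio.h>

enum io_status { IO_PENDING, IO_REQUEUED, IO_FAILED };

struct held_io {
	int id;
	enum io_status status;
	struct held_io *next;
};

struct mirror_model {
	int suspended;		/* models atomic_t suspend */
	int noflush;		/* models dm_noflush_suspending() */
	struct held_io *holds;	/* models ms->holds */
};

/* Models hold_bio(): complete immediately if suspended, else park it. */
static void hold_io(struct mirror_model *ms, struct held_io *io)
{
	if (ms->suspended) {
		io->status = ms->noflush ? IO_REQUEUED : IO_FAILED;
		return;
	}
	io->next = ms->holds;
	ms->holds = io;
}

/* Models the new tail of mirror_presuspend(): drain and re-dispatch. */
static void presuspend(struct mirror_model *ms)
{
	struct held_io *io, *next;

	ms->suspended = 1;
	io = ms->holds;
	ms->holds = NULL;
	for (; io; io = next) {
		next = io->next;
		hold_io(ms, io);	/* now completes instead of parking */
	}
}

int main(void)
{
	struct mirror_model ms = { .noflush = 1 };
	struct held_io a = { .id = 1, .status = IO_PENDING };
	struct held_io b = { .id = 2, .status = IO_PENDING };

	hold_io(&ms, &a);	/* leg failed, errors handled: park the I/O */
	hold_io(&ms, &b);
	presuspend(&ms);	/* userspace suspends to repair the mirror */

	printf("io %d -> %s\n", a.id, a.status == IO_REQUEUED ? "requeued" : "failed");
	printf("io %d -> %s\n", b.id, b.status == IO_REQUEUED ? "requeued" : "failed");
	return 0;
}

The point of the model is the second call to hold_io() during presuspend: once the suspend flag is set a bio can no longer be parked, so the same routine completes it, either requeueing it (noflush suspend) or failing it, which is the branch the real hold_bio() takes in the patch.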