author	Jonathan Brassow <jbrassow@redhat.com>	2008-02-07 21:11:29 -0500
committer	Alasdair G Kergon <agk@redhat.com>	2008-02-07 21:11:29 -0500
commit	72f4b314100bae85c75d8e4c6fec621ab44e777d
tree	6dc5e860e8a4acab2e047f31391d0c8fdf366ff7 /drivers/md
parent	d74f81f8adc504a23be3babf347b9f69e9389924
dm raid1: handle write failures
This patch gives mirror the ability to handle device failures
during normal write operations.
The 'write_callback' function is called when a write completes. If all
the writes failed or succeeded, we report failure or success
respectively. If some of the writes failed, we call fail_mirror, which
increments the error count for the device, notes the type of error
encountered (DM_RAID1_WRITE_ERROR), and selects a new primary (if
necessary). Note that the primary device can never change while the
mirror is not in-sync (IOW, while recovery is happening). This means
that the scenario where a failed write changes the primary and gives
recovery_complete a chance to misread the primary never happens. The
fact that the primary can change has necessitated the change to the
default_mirror field: we need to protect against reading garbage while
the primary changes.
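
As an illustration of the logic just described, here is a minimal
userspace C sketch (all names are hypothetical; bios, locking and the
in-sync restriction on primary changes are simplified away):

#include <stdio.h>

#define NR_MIRRORS 3

struct leg {
	const char *name;
	unsigned error_count;	/* non-zero => device has seen errors */
};

static int default_leg;		/* index of the current primary */

/*
 * Pick the first leg with no recorded errors, as fail_mirror() does.
 * (The kernel additionally refuses to switch while recovery runs.)
 */
static void elect_new_primary(struct leg *legs)
{
	int i;

	for (i = 0; i < NR_MIRRORS; i++)
		if (!legs[i].error_count) {
			default_leg = i;
			return;
		}
	fprintf(stderr, "all sides of the mirror have failed\n");
}

/* 'error' is a per-leg failure bitmask, as in write_callback(). */
static int handle_write_completion(struct leg *legs, unsigned long error)
{
	int i, uptodate = 0;

	for (i = 0; i < NR_MIRRORS; i++)
		if (error & (1UL << i))
			legs[i].error_count++;	/* this leg failed */
		else
			uptodate = 1;		/* at least one leg is good */

	if (!uptodate)
		return -1;			/* every write failed: -EIO */
	if (legs[default_leg].error_count)
		elect_new_primary(legs);	/* primary was among failures */
	return 0;				/* the write still succeeded */
}

int main(void)
{
	struct leg legs[NR_MIRRORS] = { {"sda", 0}, {"sdb", 0}, {"sdc", 0} };
	int ret = handle_write_completion(legs, 0x1);	/* fail leg 0 */

	printf("ret=%d new primary=%s\n", ret, legs[default_leg].name);
	return 0;
}

Compiled standalone, failing leg 0 reports success (ret=0) and elects
sdb as the new primary; failing every leg would return -1, mirroring
the -EIO case.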
We then add the bio to a new list in the mirror set, 'failures'. For
every bio in the 'failures' list, we call a new function,
'__bio_mark_nosync', where we mark the region 'not-in-sync' in the log
and properly set the region state to RH_NOSYNC. Userspace must also be
notified of the failure; this is done by 'raising an event'
(dm_table_event()). Because fail_mirror may run in interrupt context
and raising an event can block, the event is deferred to a new work
item, 'trigger_event', via schedule_work().
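
The hand-off from a context that must not block to one that may can be
sketched in userspace with pthreads. This is only an analogue of the
mechanism described above (names are hypothetical; the printf stands in
for the region-state update and dm_table_event()):

#include <pthread.h>
#include <stdio.h>

struct failure {
	struct failure *next;
	int region;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static struct failure *failures;	/* the 'failures' list analogue */

/* Called from the (non-blocking) completion path. */
static void queue_failure(struct failure *f)
{
	pthread_mutex_lock(&lock);
	f->next = failures;
	failures = f;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&wake);	/* analogous to wake(ms) */
}

/* Worker thread: may block, so it is safe to notify userspace here. */
static void *worker(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!failures)
		pthread_cond_wait(&wake, &lock);
	while (failures) {
		struct failure *f = failures;

		failures = f->next;
		pthread_mutex_unlock(&lock);
		/* stands in for __bio_mark_nosync() + dm_table_event() */
		printf("region %d marked not-in-sync, event raised\n",
		       f->region);
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct failure f = { .next = NULL, .region = 7 };

	pthread_create(&t, NULL, worker, NULL);
	queue_failure(&f);
	pthread_join(t, NULL);
	return 0;
}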
Backwards compatibility is maintained by ignoring errors if
the DM_FEATURES_HANDLE_ERRORS flag is not present.
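
A rough sketch of such a feature gate, with hypothetical field and
macro names; when the flag was not supplied at table-load time, the new
error paths are simply skipped:

#include <stdint.h>
#include <stdio.h>

#define HANDLE_ERRORS 0x1	/* stands in for the real feature flag */

struct mirror_set_sketch {
	uint64_t features;	/* set while parsing the table's feature args */
};

static int errors_handled(struct mirror_set_sketch *ms)
{
	return !!(ms->features & HANDLE_ERRORS);
}

int main(void)
{
	struct mirror_set_sketch old_table = { .features = 0 };
	struct mirror_set_sketch new_table = { .features = HANDLE_ERRORS };

	/* old tables fall through to the pre-patch behaviour */
	printf("old: %d, new: %d\n",
	       errors_handled(&old_table), errors_handled(&new_table));
	return 0;
}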
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/dm-raid1.c	| 250
1 file changed, 224 insertions(+), 26 deletions(-)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 31123d4a6b9c..4e1e04dbc4ab 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/log2.h>
+#include <linux/hardirq.h>
 
 #define DM_MSG_PREFIX "raid1"
 #define DM_IO_PAGES 64
@@ -113,9 +114,16 @@ struct region {
 /*-----------------------------------------------------------------
  * Mirror set structures.
  *---------------------------------------------------------------*/
+enum dm_raid1_error {
+	DM_RAID1_WRITE_ERROR,
+	DM_RAID1_SYNC_ERROR,
+	DM_RAID1_READ_ERROR
+};
+
 struct mirror {
 	struct mirror_set *ms;
 	atomic_t error_count;
+	uint32_t error_type;
 	struct dm_dev *dev;
 	sector_t offset;
 };
@@ -127,9 +135,10 @@ struct mirror_set {
 	struct kcopyd_client *kcopyd_client;
 	uint64_t features;
 
-	spinlock_t lock;	/* protects the next two lists */
+	spinlock_t lock;	/* protects the lists */
 	struct bio_list reads;
 	struct bio_list writes;
+	struct bio_list failures;
 
 	struct dm_io_client *io_client;
 
@@ -138,10 +147,11 @@ struct mirror_set {
 	int in_sync;
 	int log_failure;
 
-	struct mirror *default_mirror;	/* Default mirror */
+	atomic_t default_mirror;	/* Default mirror */
 
 	struct workqueue_struct *kmirrord_wq;
 	struct work_struct kmirrord_work;
+	struct work_struct trigger_event;
 
 	unsigned int nr_mirrors;
 	struct mirror mirror[0];
@@ -646,6 +656,77 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
 	bio->bi_next = (struct bio *) ms;
 }
 
+static struct mirror *get_default_mirror(struct mirror_set *ms)
+{
+	return &ms->mirror[atomic_read(&ms->default_mirror)];
+}
+
+static void set_default_mirror(struct mirror *m)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *m0 = &(ms->mirror[0]);
+
+	atomic_set(&ms->default_mirror, m - m0);
+}
+
+/* fail_mirror
+ * @m: mirror device to fail
+ * @error_type: one of the enum's, DM_RAID1_*_ERROR
+ *
+ * If errors are being handled, record the type of
+ * error encountered for this device.  If this type
+ * of error has already been recorded, we can return;
+ * otherwise, we must signal userspace by triggering
+ * an event.  Additionally, if the device is the
+ * primary device, we must choose a new primary, but
+ * only if the mirror is in-sync.
+ *
+ * This function must not block.
+ */
+static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *new;
+
+	if (!errors_handled(ms))
+		return;
+
+	/*
+	 * error_count is used for nothing more than a
+	 * simple way to tell if a device has encountered
+	 * errors.
+	 */
+	atomic_inc(&m->error_count);
+
+	if (test_and_set_bit(error_type, &m->error_type))
+		return;
+
+	if (m != get_default_mirror(ms))
+		goto out;
+
+	if (!ms->in_sync) {
+		/*
+		 * Better to issue requests to same failing device
+		 * than to risk returning corrupt data.
+		 */
+		DMERR("Primary mirror (%s) failed while out-of-sync: "
+		      "Reads may fail.", m->dev->name);
+		goto out;
+	}
+
+	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
+		if (!atomic_read(&new->error_count)) {
+			set_default_mirror(new);
+			break;
+		}
+
+	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+		DMWARN("All sides of mirror have failed.");
+
+out:
+	schedule_work(&ms->trigger_event);
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -678,7 +759,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 	unsigned long flags = 0;
 
 	/* fill in the source */
-	m = ms->default_mirror;
+	m = get_default_mirror(ms);
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -694,7 +775,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
 
 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (&ms->mirror[i] == ms->default_mirror)
+		if (&ms->mirror[i] == get_default_mirror(ms))
 			continue;
 
 		m = ms->mirror + i;
@@ -749,7 +830,7 @@ static void do_recovery(struct mirror_set *ms)
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
 	/* FIXME: add read balancing */
-	return ms->default_mirror;
+	return get_default_mirror(ms);
 }
 
 /*
@@ -776,7 +857,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		if (rh_in_sync(&ms->rh, region, 1))
 			m = choose_mirror(ms, bio->bi_sector);
 		else
-			m = ms->default_mirror;
+			m = get_default_mirror(ms);
 
 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -793,12 +874,67 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
  * RECOVERING:	delay the io until recovery completes
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+
+/* __bio_mark_nosync
+ * @ms
+ * @bio
+ * @done
+ * @error
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+static void __bio_mark_nosync(struct mirror_set *ms,
+			      struct bio *bio, unsigned done, int error)
+{
+	unsigned long flags;
+	struct region_hash *rh = &ms->rh;
+	struct dirty_log *log = ms->rh.log;
+	struct region *reg;
+	region_t region = bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+	ms->in_sync = 0;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* region hash entry should exist because write was in-flight */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) RH_DIRTY
+	 *   2) RH_NOSYNC: was dirty, other preceeding writes failed
+	 *   3) RH_RECOVERING: flushing pending writes
+	 * Either case, the region should have not been connected to list.
+	 */
+	recovering = (reg->state == RH_RECOVERING);
+	reg->state = RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned int i;
-	int uptodate = 1;
+	unsigned i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
+	int uptodate = 0;
+	int should_wake = 0;
+	unsigned long flags;
 
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
@@ -809,20 +945,36 @@ static void write_callback(unsigned long error, void *context)
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
+	if (likely(!error))
+		goto out;
 
-	if (error) {
+	for (i = 0; i < ms->nr_mirrors; i++)
+		if (test_bit(i, &error))
+			fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
+		else
+			uptodate = 1;
+
+	if (unlikely(!uptodate)) {
+		DMERR("All replicated volumes dead, failing I/O");
+		/* None of the writes succeeded, fail the I/O. */
+		ret = -EIO;
+	} else if (errors_handled(ms)) {
 		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
+		 * Need to raise event.  Since raising
+		 * events can block, we need to do it in
+		 * the main thread.
 		 */
-		uptodate = 0;
-		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
-				uptodate = 1;
-				break;
-			}
+		spin_lock_irqsave(&ms->lock, flags);
+		if (!ms->failures.head)
+			should_wake = 1;
+		bio_list_add(&ms->failures, bio);
+		spin_unlock_irqrestore(&ms->lock, flags);
+		if (should_wake)
+			wake(ms);
+		return;
 	}
-	bio_endio(bio, 0);
+ out:
+	bio_endio(bio, ret);
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -910,33 +1062,75 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->default_mirror, bio);
+		map_bio(ms, get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
 
+static void do_failures(struct mirror_set *ms, struct bio_list *failures)
+{
+	struct bio *bio;
+
+	if (!failures->head)
+		return;
+
+	while ((bio = bio_list_pop(failures)))
+		__bio_mark_nosync(ms, bio, bio->bi_size, 0);
+}
+
+static void trigger_event(struct work_struct *work)
+{
+	struct mirror_set *ms =
+		container_of(work, struct mirror_set, trigger_event);
+
+	dm_table_event(ms->ti->table);
+}
+
 /*-----------------------------------------------------------------
  * kmirrord
 *---------------------------------------------------------------*/
-static void do_mirror(struct work_struct *work)
+static int _do_mirror(struct work_struct *work)
 {
 	struct mirror_set *ms =container_of(work, struct mirror_set,
 					     kmirrord_work);
-	struct bio_list reads, writes;
+	struct bio_list reads, writes, failures;
+	unsigned long flags;
 
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	reads = ms->reads;
 	writes = ms->writes;
+	failures = ms->failures;
 	bio_list_init(&ms->reads);
 	bio_list_init(&ms->writes);
-	spin_unlock(&ms->lock);
+	bio_list_init(&ms->failures);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	rh_update_states(&ms->rh);
 	do_recovery(ms);
 	do_reads(ms, &reads);
 	do_writes(ms, &writes);
+	do_failures(ms, &failures);
+
+	return (ms->failures.head) ? 1 : 0;
+}
+
+static void do_mirror(struct work_struct *work)
+{
+	/*
+	 * If _do_mirror returns 1, we give it
+	 * another shot.  This helps for cases like
+	 * 'suspend' where we call flush_workqueue
+	 * and expect all work to be finished.  If
+	 * a failure happens during a suspend, we
+	 * couldn't issue a 'wake' because it would
+	 * not be honored.  Therefore, we return '1'
+	 * from _do_mirror, and retry here.
	 */
+	while (_do_mirror(work))
+		schedule();
 }
 
+
 /*-----------------------------------------------------------------
  * Target functions
  *---------------------------------------------------------------*/
@@ -965,7 +1159,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
-	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
@@ -1019,6 +1213,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 	}
 
 	ms->mirror[mirror].ms = ms;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
+	ms->mirror[mirror].error_type = 0;
 	ms->mirror[mirror].offset = offset;
 
 	return 0;
@@ -1171,6 +1367,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto err_free_context;
 	}
 	INIT_WORK(&ms->kmirrord_work, do_mirror);
+	INIT_WORK(&ms->trigger_event, trigger_event);
 
 	r = parse_features(ms, argc, argv, &args_used);
 	if (r)
@@ -1220,14 +1417,15 @@ static void mirror_dtr(struct dm_target *ti)
 
 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
 {
+	unsigned long flags;
 	int should_wake = 0;
 	struct bio_list *bl;
 
 	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
-	spin_lock(&ms->lock);
+	spin_lock_irqsave(&ms->lock, flags);
 	should_wake = !(bl->head);
 	bio_list_add(bl, bio);
-	spin_unlock(&ms->lock);
+	spin_unlock_irqrestore(&ms->lock, flags);
 
 	if (should_wake)
 		wake(ms);