author    Jonathan Brassow <jbrassow@redhat.com>   2008-02-07 21:11:37 -0500
committer Alasdair G Kergon <agk@redhat.com>       2008-02-07 21:11:37 -0500
commit    06386bbfd2441416875d0403d405c56822f6ebac (patch)
tree      ce66e5d061f67df6e85854aafaf0c43620513359 /drivers/md
parent    b80aa7a0c268d3ae0c472f648af1e3e4a359765c (diff)
dm raid1: handle read failures
This patch gives the ability to respond-to/record device failures
that happen during read operations.  It also adds the ability to
read from mirror devices that are not the primary if they are
in-sync.

There are essentially two read paths in mirroring: the direct path
and the queued path.  When a read request is mapped, the direct
path is taken if the region is 'in-sync'; otherwise the queued
path is taken.

If the direct path is taken, we must record bio information so that
we can retry the read if it fails.  We then discover the status of
a direct read through mirror_end_io.  If the read has failed, we
mark the device from which the read was attempted as failed (so we
don't try to read from it again), restore the bio and try again.

If the queued path is taken, we discover the results of the read
from 'read_callback'.  If the device failed, we mark the device as
failed and attempt the read again if there is another device where
this region is known to be 'in-sync'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
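The failover logic at the heart of this patch is choose_mirror()'s backward walk from the default mirror. As a rough illustration only, the loop can be modelled in standalone userspace C; the mirrors array and default_mirror index below are hypothetical stand-ins for the mirror_set fields, not the kernel structures:

#include <stdio.h>

#define NR_MIRRORS 3

struct mirror {
	const char *name;
	int error_count;	/* models the atomic_t error_count */
};

static struct mirror mirrors[NR_MIRRORS] = {
	{ "mirror0", 0 }, { "mirror1", 0 }, { "mirror2", 0 },
};
static int default_mirror;	/* index of DEFAULT_MIRROR */

/*
 * Walk backwards from the default mirror, wrapping around the array
 * (same pointer trick as the patch), until a device with no recorded
 * errors is found; NULL means every mirror has failed.
 */
static struct mirror *choose_mirror(void)
{
	struct mirror *m = &mirrors[default_mirror];

	do {
		if (!m->error_count)
			return m;
		if (m-- == mirrors)
			m += NR_MIRRORS;
	} while (m != &mirrors[default_mirror]);

	return NULL;
}

int main(void)
{
	mirrors[0].error_count = 1;	/* simulate a read failure on the default */
	struct mirror *m = choose_mirror();

	printf("read served by: %s\n", m ? m->name : "(none, -EIO)");
	return 0;
}

With mirror0 marked failed, the walk wraps around and returns mirror2; in the driver this is what keeps reads flowing after fail_mirror() bumps a device's error_count.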
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-raid1.c | 256
1 file changed, 211 insertions(+), 45 deletions(-)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec6d675bf766..38efa7071dd7 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -141,6 +142,7 @@ struct mirror_set {
 	struct bio_list failures;
 
 	struct dm_io_client *io_client;
+	mempool_t *read_record_pool;
 
 	/* recovery */
 	region_t nr_regions;
@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh)
 	wake(rh->ms);
 }
 
+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+	struct mirror *m;
+	struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky.  We squirrel the mirror_set struct away inside
- * bi_next for write buffers.  This is safe since the bh
+ * This is yucky.  We squirrel the mirror struct away inside
+ * bi_next for read/write buffers.  This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }
 
 static struct mirror *get_default_mirror(struct mirror_set *ms)
@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
  *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return get_default_mirror(ms);
+	struct mirror *m = get_default_mirror(ms);
+
+	do {
+		if (likely(!atomic_read(&m->error_count)))
+			return m;
+
+		if (m-- == ms->mirror)
+			m += ms->nr_mirrors;
+	} while (m != get_default_mirror(ms));
+
+	return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+	struct mirror *default_mirror = get_default_mirror(m->ms);
+
+	return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+	region_t region = bio_to_region(&ms->rh, bio);
+
+	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+		return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+	return 0;
 }
 
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (likely(!error)) {
+		bio_endio(bio, 0);
+		return;
+	}
+
+	fail_mirror(m, DM_RAID1_READ_ERROR);
+
+	if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+		DMWARN_LIMIT("Read failure on mirror device %s. "
+			     "Trying alternative device.",
+			     m->dev->name);
+		queue_bio(m->ms, bio, bio_rw(bio));
+		return;
+	}
+
+	DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+		    m->dev->name);
+	bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+	struct dm_io_request io_req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_BVEC,
+		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.notify.fn = read_callback,
+		.notify.context = bio,
+		.client = m->ms->io_client,
+	};
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	(void) dm_io(&io_req, 1, &io, NULL);
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 
 	while ((bio = bio_list_pop(reads))) {
 		region = bio_to_region(&ms->rh, bio);
+		m = get_default_mirror(ms);
 
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 1))
+		if (likely(rh_in_sync(&ms->rh, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
-		else
-			m = get_default_mirror(ms);
+		else if (m && atomic_read(&m->error_count))
+			m = NULL;
 
-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, -EIO);
 	}
 }
 
@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context)
 	int should_wake = 0;
 	unsigned long flags;
 
-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = bio_get_m(bio)->ms;
+	bio_set_m(bio, NULL);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,
@@ -1008,7 +1107,7 @@ out:
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 		.client = ms->io_client,
 	};
 
-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
-
-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);
 
-	bio_set_ms(bio, ms);
+	/*
+	 * Use default mirror because we only need it to retrieve the reference
+	 * to the mirror set in write_callback().
+	 */
+	bio_set_m(bio, get_default_mirror(ms));
 
 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, get_default_mirror(ms), bio);
+		map_bio(get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
+	len = sizeof(struct dm_raid1_read_record);
+	ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+							   len);
+	if (!ms->read_record_pool) {
+		ti->error = "Error creating mirror read_record_pool";
+		kfree(ms);
+		return NULL;
+	}
+
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 
 	dm_io_client_destroy(ms->io_client);
 	rh_exit(&ms->rh);
+	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
 
@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio_to_region(&ms->rh, bio);
+	struct dm_raid1_read_record *read_record = NULL;
 
 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -1523,28 +1634,34 @@
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
-		r = DM_MAPIO_SUBMITTED;
-
 	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead.  So we just let it silently fail.
-	 * FIXME: get rid of this.
+	 * If region is not in-sync queue the bio.
 	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (!r || (r == -EWOULDBLOCK)) {
+		if (rw == READA)
+			return -EWOULDBLOCK;
 
-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
 
+	/*
+	 * The region is in-sync and we can perform reads directly.
+	 * Store enough information so we can retry if it fails.
+	 */
 	m = choose_mirror(ms, bio->bi_sector);
-	if (!m)
+	if (unlikely(!m))
 		return -EIO;
 
-	map_bio(ms, m, bio);
+	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+	if (likely(read_record)) {
+		dm_bio_record(&read_record->details, bio);
+		map_context->ptr = read_record;
+		read_record->m = m;
+	}
+
+	map_bio(m, bio);
+
 	return DM_MAPIO_REMAPPED;
 }
 
@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
+	struct dm_bio_details *bd = NULL;
+	struct dm_raid1_read_record *read_record = map_context->ptr;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}
 
-	return 0;
+	if (error == -EOPNOTSUPP)
+		goto out;
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		goto out;
+
+	if (unlikely(error)) {
+		if (!read_record) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.
+			 */
+			DMERR_LIMIT("Mirror read failed.");
+			return -EIO;
+		}
+
+		m = read_record->m;
+		DMERR("Mirror read failed from %s. Trying alternative device.",
+		      m->dev->name);
+		fail_mirror(m, DM_RAID1_READ_ERROR);
+
+		/*
+		 * A failed read is requeued for another attempt using an intact
+		 * mirror.
+		 */
+		if (default_ok(m) || mirror_available(ms, bio)) {
+			bd = &read_record->details;
+
+			dm_bio_restore(bd, bio);
+			mempool_free(read_record, ms->read_record_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1;
+		}
+		DMERR("All replicated volumes dead, failing I/O");
+	}
+
+out:
+	if (read_record) {
+		mempool_free(read_record, ms->read_record_pool);
+		map_context->ptr = NULL;
+	}
+
+	return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
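A closing note on the direct-path retry: dm_bio_record()/dm_bio_restore() (from dm-bio-record.h) snapshot and rewind the bio fields that submission mutates, which is what lets mirror_end_io() requeue a failed read as if it had never been mapped. Below is a rough standalone model of that save/restore pattern, using hypothetical simplified types rather than the real struct bio:

#include <stdio.h>

/* Hypothetical stand-ins for the fields dm_bio_record() preserves. */
struct bio {
	unsigned long sector;
	unsigned int size;
	int dev;
};

struct bio_details {
	struct bio saved;
};

static void bio_record(struct bio_details *bd, struct bio *bio)
{
	bd->saved = *bio;	/* snapshot before the read is remapped */
}

static void bio_restore(struct bio_details *bd, struct bio *bio)
{
	*bio = bd->saved;	/* rewind so the bio can be resubmitted */
}

int main(void)
{
	struct bio bio = { .sector = 1024, .size = 4096, .dev = 0 };
	struct bio_details bd;

	bio_record(&bd, &bio);	/* cf. dm_bio_record() in mirror_map() */

	bio.dev = 1;		/* remap to a mirror; pretend the read fails */
	bio.sector += 42;	/* submission mangled the bio */

	bio_restore(&bd, &bio);	/* cf. dm_bio_restore() in mirror_end_io() */
	printf("restored: dev=%d sector=%lu size=%u\n",
	       bio.dev, bio.sector, bio.size);
	return 0;
}

The real dm_bio_details saves fields such as bi_sector, bi_bdev, bi_size, bi_idx and bi_flags, but the principle is the same: restore, then hand the bio back to queue_bio() for another attempt on an intact mirror.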