 drivers/md/dm-raid1.c | 256 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 211 insertions(+), 45 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec6d675bf766..38efa7071dd7 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -141,6 +142,7 @@ struct mirror_set {
         struct bio_list failures;
 
         struct dm_io_client *io_client;
+        mempool_t *read_record_pool;
 
         /* recovery */
         region_t nr_regions;
@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh)
         wake(rh->ms);
 }
 
+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+        struct mirror *m;
+        struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky. We squirrel the mirror_set struct away inside
- * bi_next for write buffers. This is safe since the bh
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read/write buffers. This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-        return (struct mirror_set *) bio->bi_next;
+        return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-        bio->bi_next = (struct bio *) ms;
+        bio->bi_next = (struct bio *) m;
 }
 
 static struct mirror *get_default_mirror(struct mirror_set *ms)
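The renamed helpers above generalize an old trick: while dm-raid1 owns a bio, its bi_next link is unused, so a struct mirror pointer can be stashed there, provided it is cleared again in the completion callback before the bio goes anywhere else. A minimal user-space sketch of the same stash-and-clear idiom (the two helper names come from the patch, everything else is hypothetical):

    #include <assert.h>
    #include <stdio.h>

    struct bio { struct bio *bi_next; };        /* unused while stashed */
    struct mirror { const char *name; };

    /* Smuggle an unrelated pointer through a field the owner is not using. */
    static void bio_set_m(struct bio *bio, struct mirror *m)
    {
            bio->bi_next = (struct bio *) m;
    }

    static struct mirror *bio_get_m(struct bio *bio)
    {
            return (struct mirror *) bio->bi_next;
    }

    int main(void)
    {
            struct mirror m = { "mirror_0" };
            struct bio bio = { NULL };

            bio_set_m(&bio, &m);            /* before handing off to dm_io */
            printf("served by %s\n", bio_get_m(&bio)->name);
            bio_set_m(&bio, NULL);          /* always cleared in the callback */
            assert(bio.bi_next == NULL);
            return 0;
    }

This only works because the bio is never submitted to the lower block layers while the pointer is parked in bi_next, which is exactly what the comment in the hunk warns about.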
@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
  *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-        /* FIXME: add read balancing */
-        return get_default_mirror(ms);
+        struct mirror *m = get_default_mirror(ms);
+
+        do {
+                if (likely(!atomic_read(&m->error_count)))
+                        return m;
+
+                if (m-- == ms->mirror)
+                        m += ms->nr_mirrors;
+        } while (m != get_default_mirror(ms));
+
+        return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+        struct mirror *default_mirror = get_default_mirror(m->ms);
+
+        return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+        region_t region = bio_to_region(&ms->rh, bio);
+
+        if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+                return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+        return 0;
 }
 
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+        return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
         bio->bi_bdev = m->dev->bdev;
-        bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+        bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+                       struct bio *bio)
+{
+        io->bdev = m->dev->bdev;
+        io->sector = map_sector(m, bio);
+        io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+        struct bio *bio = context;
+        struct mirror *m;
+
+        m = bio_get_m(bio);
+        bio_set_m(bio, NULL);
+
+        if (likely(!error)) {
+                bio_endio(bio, 0);
+                return;
+        }
+
+        fail_mirror(m, DM_RAID1_READ_ERROR);
+
+        if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+                DMWARN_LIMIT("Read failure on mirror device %s. "
+                             "Trying alternative device.",
+                             m->dev->name);
+                queue_bio(m->ms, bio, bio_rw(bio));
+                return;
+        }
+
+        DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+                    m->dev->name);
+        bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+        struct io_region io;
+        struct dm_io_request io_req = {
+                .bi_rw = READ,
+                .mem.type = DM_IO_BVEC,
+                .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+                .notify.fn = read_callback,
+                .notify.context = bio,
+                .client = m->ms->io_client,
+        };
+
+        map_region(&io, m, bio);
+        bio_set_m(bio, m);
+        (void) dm_io(&io_req, 1, &io, NULL);
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
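choose_mirror() now implements the read balancing its old FIXME promised: starting at the default mirror it walks the mirror array backwards, wrapping from the first element to the last, and returns the first leg with a zero error_count, or NULL once it has come full circle. A standalone sketch of that wrap-around walk (plain user-space C with hypothetical sizes and state, not kernel code):

    #include <stdio.h>

    #define NR_MIRRORS 3

    struct mirror { int error_count; };

    static struct mirror mirrors[NR_MIRRORS];
    static int default_idx;                      /* DEFAULT_MIRROR */

    static struct mirror *choose_mirror(void)
    {
            struct mirror *m = &mirrors[default_idx];

            do {
                    if (!m->error_count)
                            return m;            /* first healthy leg wins */

                    if (m-- == mirrors)          /* stepped off the front? */
                            m += NR_MIRRORS;     /* wrap to the back */
            } while (m != &mirrors[default_idx]);

            return NULL;                         /* every leg has failed */
    }

    int main(void)
    {
            mirrors[0].error_count = 1;          /* default leg is bad */
            printf("chose mirror %td\n", choose_mirror() - mirrors);
            return 0;
    }

default_ok() and mirror_available() build on this: the first asks whether the default mirror itself is still healthy, the second asks whether the bio's region is in-sync and some healthy leg exists. Those are exactly the two questions the retry paths below use to decide between requeueing a failed read and failing the I/O.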
@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 
         while ((bio = bio_list_pop(reads))) {
                 region = bio_to_region(&ms->rh, bio);
+                m = get_default_mirror(ms);
 
                 /*
                  * We can only read balance if the region is in sync.
                  */
-                if (rh_in_sync(&ms->rh, region, 1))
+                if (likely(rh_in_sync(&ms->rh, region, 1)))
                         m = choose_mirror(ms, bio->bi_sector);
-                else
-                        m = get_default_mirror(ms);
+                else if (m && atomic_read(&m->error_count))
+                        m = NULL;
 
-                map_bio(ms, m, bio);
-                generic_make_request(bio);
+                if (likely(m))
+                        read_async_bio(m, bio);
+                else
+                        bio_endio(bio, -EIO);
         }
 }
 
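do_reads() previously remapped the bio and called generic_make_request(); it now presets m to the default mirror, read-balances only when the region is in sync, sets m to NULL when the region is out of sync and the default leg is erroring, and hands healthy reads to read_async_bio(). That routine uses dm-io's notify interface: dm_io() returns immediately and read_callback() later decides between completion and retry. A stripped-down user-space sketch of that submit/notify shape (all names hypothetical except read_callback, and submit_io stands in for dm_io):

    #include <stdio.h>

    typedef void (*io_notify_fn)(unsigned long error, void *context);

    struct io_request {
            io_notify_fn fn;        /* like dm_io_request.notify.fn */
            void *context;          /* like dm_io_request.notify.context */
    };

    /* Stand-in for dm_io(): pretend the device reported an error. */
    static void submit_io(struct io_request *req)
    {
            req->fn(1UL, req->context);     /* nonzero bits == failure */
    }

    static void read_callback(unsigned long error, void *context)
    {
            const char *bio = context;

            if (!error) {
                    printf("%s: completed\n", bio);
                    return;
            }
            printf("%s: failed, requeue to another mirror\n", bio);
    }

    int main(void)
    {
            struct io_request req = { read_callback, "bio@sector4096" };

            submit_io(&req);
            return 0;
    }

The payoff is that a read error no longer surfaces straight to the caller: read_callback() marks the leg failed and requeues the bio through queue_bio(), so the retry happens transparently inside the target.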
894 | 993 | ||
@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context) | |||
964 | int should_wake = 0; | 1063 | int should_wake = 0; |
965 | unsigned long flags; | 1064 | unsigned long flags; |
966 | 1065 | ||
967 | ms = bio_get_ms(bio); | 1066 | ms = bio_get_m(bio)->ms; |
968 | bio_set_ms(bio, NULL); | 1067 | bio_set_m(bio, NULL); |
969 | 1068 | ||
970 | /* | 1069 | /* |
971 | * NOTE: We don't decrement the pending count here, | 1070 | * NOTE: We don't decrement the pending count here, |
@@ -1008,7 +1107,7 @@ out: | |||
1008 | static void do_write(struct mirror_set *ms, struct bio *bio) | 1107 | static void do_write(struct mirror_set *ms, struct bio *bio) |
1009 | { | 1108 | { |
1010 | unsigned int i; | 1109 | unsigned int i; |
1011 | struct io_region io[KCOPYD_MAX_REGIONS+1]; | 1110 | struct io_region io[ms->nr_mirrors], *dest = io; |
1012 | struct mirror *m; | 1111 | struct mirror *m; |
1013 | struct dm_io_request io_req = { | 1112 | struct dm_io_request io_req = { |
1014 | .bi_rw = WRITE, | 1113 | .bi_rw = WRITE, |
@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
1019 | .client = ms->io_client, | 1118 | .client = ms->io_client, |
1020 | }; | 1119 | }; |
1021 | 1120 | ||
1022 | for (i = 0; i < ms->nr_mirrors; i++) { | 1121 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) |
1023 | m = ms->mirror + i; | 1122 | map_region(dest++, m, bio); |
1024 | |||
1025 | io[i].bdev = m->dev->bdev; | ||
1026 | io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); | ||
1027 | io[i].count = bio->bi_size >> 9; | ||
1028 | } | ||
1029 | 1123 | ||
1030 | bio_set_ms(bio, ms); | 1124 | /* |
1125 | * Use default mirror because we only need it to retrieve the reference | ||
1126 | * to the mirror set in write_callback(). | ||
1127 | */ | ||
1128 | bio_set_m(bio, get_default_mirror(ms)); | ||
1031 | 1129 | ||
1032 | (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); | 1130 | (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); |
1033 | } | 1131 | } |
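Two things change in the write path. write_callback() now recovers the mirror set through the stashed mirror (bio_get_m(bio)->ms), and do_write() sizes its io_region array to exactly ms->nr_mirrors (a C99 variable-length array) instead of the fixed KCOPYD_MAX_REGIONS+1, filling each slot through the new map_region() helper. The remap arithmetic shared by map_sector() and map_region() is plain offset translation; a runnable sketch with hypothetical numbers:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    struct mirror { sector_t offset; };          /* start of this leg */

    /* m->offset + (bio->bi_sector - ti->begin), as in map_sector() */
    static sector_t map_sector(const struct mirror *m, sector_t bi_sector,
                               sector_t ti_begin)
    {
            return m->offset + (bi_sector - ti_begin);
    }

    int main(void)
    {
            /* one write fans out to every leg, each at its own offset */
            struct mirror legs[2] = { { 0 }, { 1 << 20 } };
            sector_t bi_sector = 4096, ti_begin = 2048;

            for (int i = 0; i < 2; i++)
                    printf("leg %d -> sector %llu\n", i,
                           map_sector(&legs[i], bi_sector, ti_begin));
            return 0;
    }

Note also the comment the patch adds: after the rename, writes stash the default mirror in the bio purely so write_callback() can get back to the mirror set; any leg would do.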
@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
                 rh_delay(&ms->rh, bio);
 
         while ((bio = bio_list_pop(&nosync))) {
-                map_bio(ms, get_default_mirror(ms), bio);
+                map_bio(get_default_mirror(ms), bio);
                 generic_make_request(bio);
         }
 }
@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
         atomic_set(&ms->suspend, 0);
         atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
+        len = sizeof(struct dm_raid1_read_record);
+        ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+                                                           len);
+        if (!ms->read_record_pool) {
+                ti->error = "Error creating mirror read_record_pool";
+                kfree(ms);
+                return NULL;
+        }
+
         ms->io_client = dm_io_client_create(DM_IO_PAGES);
         if (IS_ERR(ms->io_client)) {
                 ti->error = "Error creating dm_io client";
+                mempool_destroy(ms->read_record_pool);
                 kfree(ms);
                 return NULL;
         }
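alloc_context() creates the read-record mempool before the dm-io client, so every failure path that follows has to tear the pool down again; this hunk and the two that follow thread mempool_destroy() through both later error exits and through free_context(). The patch repeats the cleanup calls in each branch; the equivalent goto-unwind idiom, shown here as a hedged standalone sketch (all names hypothetical), keeps teardown in one place, in reverse order of construction:

    #include <stdlib.h>

    struct ctx { void *pool, *client, *rh; };

    static void *create(void)    { return malloc(1); }   /* stand-ins */
    static void destroy(void *p) { free(p); }

    static struct ctx *alloc_ctx(void)
    {
            struct ctx *c = calloc(1, sizeof(*c));

            if (!c)
                    return NULL;
            if (!(c->pool = create()))
                    goto bad_pool;
            if (!(c->client = create()))
                    goto bad_client;
            if (!(c->rh = create()))
                    goto bad_rh;
            return c;

    bad_rh:
            destroy(c->client);
    bad_client:
            destroy(c->pool);
    bad_pool:
            free(c);
            return NULL;
    }

    int main(void)
    {
            struct ctx *c = alloc_ctx();

            if (c) {
                    destroy(c->rh);
                    destroy(c->client);
                    destroy(c->pool);
                    free(c);
            }
            return 0;
    }

MIN_READ_RECORDS (20) sets the pool's pre-allocated reserve, which is what lets a read record still be obtained on the I/O path when memory is tight.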
@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
         if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
                 ti->error = "Error creating dirty region hash";
                 dm_io_client_destroy(ms->io_client);
+                mempool_destroy(ms->read_record_pool);
                 kfree(ms);
                 return NULL;
         }
@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 
         dm_io_client_destroy(ms->io_client);
         rh_exit(&ms->rh);
+        mempool_destroy(ms->read_record_pool);
         kfree(ms);
 }
 
@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
         int r, rw = bio_rw(bio);
         struct mirror *m;
         struct mirror_set *ms = ti->private;
-
-        map_context->ll = bio_to_region(&ms->rh, bio);
+        struct dm_raid1_read_record *read_record = NULL;
 
         if (rw == WRITE) {
+                /* Save region for mirror_end_io() handler */
+                map_context->ll = bio_to_region(&ms->rh, bio);
                 queue_bio(ms, bio, rw);
                 return DM_MAPIO_SUBMITTED;
         }
@@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
         if (r < 0 && r != -EWOULDBLOCK)
                 return r;
 
-        if (r == -EWOULDBLOCK)  /* FIXME: ugly */
-                r = DM_MAPIO_SUBMITTED;
-
         /*
-         * We don't want to fast track a recovery just for a read
-         * ahead.  So we just let it silently fail.
-         * FIXME: get rid of this.
+         * If region is not in-sync queue the bio.
          */
-        if (!r && rw == READA)
-                return -EIO;
+        if (!r || (r == -EWOULDBLOCK)) {
+                if (rw == READA)
+                        return -EWOULDBLOCK;
 
-        if (!r) {
-                /* Pass this io over to the daemon */
                 queue_bio(ms, bio, rw);
                 return DM_MAPIO_SUBMITTED;
         }
 
+        /*
+         * The region is in-sync and we can perform reads directly.
+         * Store enough information so we can retry if it fails.
+         */
         m = choose_mirror(ms, bio->bi_sector);
-        if (!m)
+        if (unlikely(!m))
                 return -EIO;
 
-        map_bio(ms, m, bio);
+        read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+        if (likely(read_record)) {
+                dm_bio_record(&read_record->details, bio);
+                map_context->ptr = read_record;
+                read_record->m = m;
+        }
+
+        map_bio(m, bio);
+
         return DM_MAPIO_REMAPPED;
 }
 
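mirror_map() now treats the two out-of-sync outcomes of the log query uniformly (!r and -EWOULDBLOCK both mean "queue it", with read-ahead getting -EWOULDBLOCK back instead of the old silent -EIO) and, on the in-sync fast path, records retry state before remapping: a dm_raid1_read_record allocated with GFP_NOIO holds the chosen mirror plus a dm_bio_record() snapshot of the bio fields that servicing the read will clobber. A user-space reduction of that snapshot/restore pattern (fields abridged, names suffixed _demo to mark them hypothetical):

    #include <stdio.h>

    struct bio_demo {
            int bi_bdev;                       /* stand-in for the device */
            unsigned long long bi_sector;
            unsigned int bi_size;
    };

    struct bio_details_demo { struct bio_demo saved; };

    static void dm_bio_record_demo(struct bio_details_demo *bd,
                                   const struct bio_demo *bio)
    {
            bd->saved = *bio;                  /* snapshot before remap */
    }

    static void dm_bio_restore_demo(const struct bio_details_demo *bd,
                                    struct bio_demo *bio)
    {
            *bio = bd->saved;                  /* rewind for the retry */
    }

    int main(void)
    {
            struct bio_demo bio = { 0, 4096, 512 };
            struct bio_details_demo bd;

            dm_bio_record_demo(&bd, &bio);
            bio.bi_bdev = 1;                   /* remapped to mirror 1 ... */
            bio.bi_sector += 1 << 20;          /* ... and the read fails */
            dm_bio_restore_demo(&bd, &bio);
            printf("restored to sector %llu\n", bio.bi_sector);
            return 0;
    }

If the allocation does not succeed, map_context->ptr is left unset and the bio is still remapped and issued; it simply cannot be retried on error, the case mirror_end_io() handles via its !read_record branch.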
@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 {
         int rw = bio_rw(bio);
         struct mirror_set *ms = (struct mirror_set *) ti->private;
-        region_t region = map_context->ll;
+        struct mirror *m = NULL;
+        struct dm_bio_details *bd = NULL;
+        struct dm_raid1_read_record *read_record = map_context->ptr;
 
         /*
          * We need to dec pending if this was a write.
          */
-        if (rw == WRITE)
-                rh_dec(&ms->rh, region);
+        if (rw == WRITE) {
+                rh_dec(&ms->rh, map_context->ll);
+                return error;
+        }
 
-        return 0;
+        if (error == -EOPNOTSUPP)
+                goto out;
+
+        if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+                goto out;
+
+        if (unlikely(error)) {
+                if (!read_record) {
+                        /*
+                         * There wasn't enough memory to record necessary
+                         * information for a retry or there was no other
+                         * mirror in-sync.
+                         */
+                        DMERR_LIMIT("Mirror read failed from %s.",
+                                    m->dev->name);
+                        return -EIO;
+                }
+                DMERR("Mirror read failed from %s. Trying alternative device.",
+                      m->dev->name);
+
+                m = read_record->m;
+                fail_mirror(m, DM_RAID1_READ_ERROR);
+
+                /*
+                 * A failed read is requeued for another attempt using an intact
+                 * mirror.
+                 */
+                if (default_ok(m) || mirror_available(ms, bio)) {
+                        bd = &read_record->details;
+
+                        dm_bio_restore(bd, bio);
+                        mempool_free(read_record, ms->read_record_pool);
+                        map_context->ptr = NULL;
+                        queue_bio(ms, bio, rw);
+                        return 1;
+                }
+                DMERR("All replicated volumes dead, failing I/O");
+        }
+
+out:
+        if (read_record) {
+                mempool_free(read_record, ms->read_record_pool);
+                map_context->ptr = NULL;
+        }
+
+        return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)
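mirror_end_io() now distinguishes its cases explicitly: writes decrement the region's pending count and pass the error through; -EOPNOTSUPP and failed read-aheads (-EWOULDBLOCK) skip the retry machinery; and a genuine read error either requeues the bio against an intact leg, returning 1 (DM_ENDIO_INCOMPLETE, meaning the target will complete the bio itself), or gives up. One caveat visible in this hunk: both error printks dereference m->dev->name before m is assigned from read_record->m, and m is still NULL at that point (it is initialized to NULL above), so an actual read failure would oops here. An illustrative reordering only, not part of this patch, using nothing but identifiers from the code above:

    /*
     * Hedged sketch: fetch the failing mirror before logging, so that
     * m->dev->name is never dereferenced while m is still NULL.
     */
    if (unlikely(error)) {
            if (!read_record) {
                    DMERR_LIMIT("Mirror read failed.");
                    return -EIO;
            }

            m = read_record->m;
            DMERR("Mirror read failed from %s. Trying alternative device.",
                  m->dev->name);
            fail_mirror(m, DM_RAID1_READ_ERROR);

            if (default_ok(m) || mirror_available(ms, bio)) {
                    dm_bio_restore(&read_record->details, bio);
                    mempool_free(read_record, ms->read_record_pool);
                    map_context->ptr = NULL;
                    queue_bio(ms, bio, rw);
                    return 1;
            }
            DMERR("All replicated volumes dead, failing I/O");
    }

In every remaining path the read record, if one was allocated, is released at the out: label, so the mempool entry is never leaked.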