author     NeilBrown <neilb@cse.unsw.edu.au>        2005-09-09 19:23:55 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2005-09-09 19:39:12 -0400
commit     934ce7c840992a771ffc478b132092db9c935c42 (patch)
tree       7eeacbe7d20ec9eaf80eca793dbb45a0670b18ef
parent     72626685dc66d455742a7f215a0535c551628b9e (diff)
[PATCH] md: write-intent bitmap support for raid6
This is a direct port of the raid5 patch.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  drivers/md/md.c           2
-rw-r--r--  drivers/md/raid6main.c  133
2 files changed, 124 insertions, 11 deletions
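
For readers unfamiliar with md's write-intent bitmaps, the idea this patch wires into raid6 is: set the bitmap bit covering a stripe before the first write to it is issued (bitmap_startwrite), and clear it only once the last outstanding write to that region has completed (bitmap_endwrite), so that after a crash only the regions still marked dirty need to be resynced. The sketch below is a hedged, minimal userspace model of that bookkeeping, not kernel code; the toy_* names, the chunk size, and the per-chunk pending counter are all hypothetical, and the real API lives in drivers/md/bitmap.c.

/*
 * Minimal userspace sketch of the write-intent bitmap bookkeeping
 * (assumed model, not the kernel implementation): mark a chunk dirty
 * before any write to it is issued, clear the mark only after the last
 * in-flight write to that chunk completes.
 */
#include <stdio.h>
#include <stdint.h>

#define CHUNK_SECTORS 128          /* sectors covered by one bitmap bit (assumed) */
#define NCHUNKS       16

struct toy_bitmap {
	unsigned int dirty;        /* one bit per chunk */
	int pending[NCHUNKS];      /* in-flight writes per chunk */
};

static void toy_bitmap_startwrite(struct toy_bitmap *bm, uint64_t sector)
{
	int chunk = (int)(sector / CHUNK_SECTORS);
	bm->dirty |= 1u << chunk;  /* must be persistent before the data write goes out */
	bm->pending[chunk]++;
}

static void toy_bitmap_endwrite(struct toy_bitmap *bm, uint64_t sector)
{
	int chunk = (int)(sector / CHUNK_SECTORS);
	if (--bm->pending[chunk] == 0)
		bm->dirty &= ~(1u << chunk);  /* chunk clean again; resync can skip it */
}

int main(void)
{
	struct toy_bitmap bm = { 0 };

	toy_bitmap_startwrite(&bm, 200);  /* first write to chunk 1: bit gets set */
	toy_bitmap_startwrite(&bm, 210);  /* second write to the same chunk */
	toy_bitmap_endwrite(&bm, 200);
	printf("dirty after one completion: %#x\n", bm.dirty);  /* still dirty */
	toy_bitmap_endwrite(&bm, 210);
	printf("dirty after all complete:   %#x\n", bm.dirty);  /* clean */
	return 0;
}

In the patch itself the same pattern shows up in add_stripe_bio() (bitmap_startwrite on the first write queued to a stripe, plus STRIPE_BIT_DELAY so the write waits for the bitmap flush) and in handle_stripe() (bitmap_endwrite once the stripe's writes are returned or failed).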
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dbf540a7fccc..008149e2bc4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
 	    mddev->bitmap_file == NULL) {
-		if (mddev->level != 1 && mddev->level != 5) {
+		if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
 			/* FIXME use a better test */
 			printk(KERN_WARNING "md: bitmaps only support for raid1\n");
 			return -EINVAL;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 09cb7272c09f..267eb1430c83 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -29,6 +29,8 @@
 #include <asm/atomic.h>
 #include "raid6.h"
 
+#include <linux/raid/bitmap.h>
+
 /*
  * Stripe cache
  */
@@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state))
 			list_add_tail(&sh->lru, &conf->delayed_list);
-		else
+		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			 conf->seq_write == sh->bm_seq)
+			list_add_tail(&sh->lru, &conf->bitmap_list);
+		else {
+			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 			list_add_tail(&sh->lru, &conf->handle_list);
+		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
 	spin_lock_irq(&conf->device_lock);
 
 	do {
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    conf->quiesce == 0,
+				    conf->device_lock, /* nothing */);
 		sh = __find_stripe(conf, sector);
 		if (!sh) {
 			if (!conf->inactive_blocked)
@@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 {
 	struct bio **bip;
 	raid6_conf_t *conf = sh->raid_conf;
+	int firstwrite=0;
 
 	PRINTK("adding bh b#%llu to stripe s#%llu\n",
 		(unsigned long long)bi->bi_sector,
@@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
 	spin_lock(&sh->lock);
 	spin_lock_irq(&conf->device_lock);
-	if (forwrite)
+	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
-	else
+		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+			firstwrite = 1;
+	} else
 		bip = &sh->dev[dd_idx].toread;
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector, dd_idx);
 
+	if (conf->mddev->bitmap && firstwrite) {
+		sh->bm_seq = conf->seq_write;
+		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+				  STRIPE_SECTORS, 0);
+		set_bit(STRIPE_BIT_DELAY, &sh->state);
+	}
+
 	if (forwrite) {
 		/* check if page is covered */
 		sector_t sector = sh->dev[dd_idx].sector;
@@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh)
 	 * need to be failed
 	 */
 	if (failed > 2 && to_read+to_write+written) {
-		spin_lock_irq(&conf->device_lock);
 		for (i=disks; i--; ) {
+			int bitmap_end = 0;
+			spin_lock_irq(&conf->device_lock);
 			/* fail all writes first */
 			bi = sh->dev[i].towrite;
 			sh->dev[i].towrite = NULL;
-			if (bi) to_write--;
+			if (bi) { to_write--; bitmap_end = 1; }
 
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				wake_up(&conf->wait_for_overlap);
@@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh)
 			/* and fail all 'written' */
 			bi = sh->dev[i].written;
 			sh->dev[i].written = NULL;
+			if (bi) bitmap_end = 1;
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
 				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh)
 					bi = nextbi;
 				}
 			}
+			spin_unlock_irq(&conf->device_lock);
+			if (bitmap_end)
+				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+						STRIPE_SECTORS, 0, 0);
 		}
-		spin_unlock_irq(&conf->device_lock);
 	}
 	if (failed > 2 && syncing) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh)
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
 			    test_bit(R5_UPTODATE, &dev->flags) ) {
 				/* We can return any write requests */
+				int bitmap_end = 0;
 				struct bio *wbi, *wbi2;
 				PRINTK("Return write for stripe %llu disc %d\n",
 				       (unsigned long long)sh->sector, i);
@@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh)
 					}
 					wbi = wbi2;
 				}
+				if (dev->towrite == NULL)
+					bitmap_end = 1;
 				spin_unlock_irq(&conf->device_lock);
+				if (bitmap_end)
+					bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+							STRIPE_SECTORS,
+							!test_bit(STRIPE_DEGRADED, &sh->state), 0);
 			}
 		}
 	}
@@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh)
 		}
 	}
 	/* now if nothing is locked, and if we have enough data, we can start a write request */
-	if (locked == 0 && rcw == 0) {
+	if (locked == 0 && rcw == 0 &&
+	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
 		if ( must_compute > 0 ) {
 			/* We have failed blocks and need to compute them */
 			switch ( failed ) {
@@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh)
 		bdev = &sh->dev[failed_num[1]];
 		locked += !test_bit(R5_LOCKED, &bdev->flags);
 		set_bit(R5_LOCKED, &bdev->flags);
+		clear_bit(STRIPE_DEGRADED, &sh->state);
 		set_bit(R5_Wantwrite, &bdev->flags);
 
 		set_bit(STRIPE_INSYNC, &sh->state);
@@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh)
 			bi->bi_next = NULL;
 			generic_make_request(bi);
 		} else {
+			if (rw == 1)
+				set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 	}
 }
 
+static inline void activate_bit_delay(raid6_conf_t *conf)
+{
+	/* device_lock is held */
+	struct list_head head;
+	list_add(&head, &conf->bitmap_list);
+	list_del_init(&conf->bitmap_list);
+	while (!list_empty(&head)) {
+		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+		list_del_init(&sh->lru);
+		atomic_inc(&sh->count);
+		__release_stripe(conf, sh);
+	}
+}
+
 static void unplug_slaves(mddev_t *mddev)
 {
 	raid6_conf_t *conf = mddev_to_conf(mddev);
@@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q)
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 
-	if (blk_remove_plug(q))
+	if (blk_remove_plug(q)) {
+		conf->seq_flush++;
 		raid6_activate_delayed(conf);
+	}
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1652,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	sector_t first_sector;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks - 2;
+	sector_t max_sector = mddev->size << 1;
+	int sync_blocks;
 
-	if (sector_nr >= mddev->size <<1) {
+	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
 		unplug_slaves(mddev);
+
+		if (mddev->curr_resync < max_sector) /* aborted */
+			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+					&sync_blocks, 1);
+		else /* compelted sync */
+			conf->fullsync = 0;
+		bitmap_close_sync(mddev->bitmap);
+
 		return 0;
 	}
 	/* if there are 2 or more failed drives and we are trying
@@ -1667,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		*skipped = 1;
 		return rv;
 	}
+	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+		/* we can skip this block, and probably more */
+		sync_blocks /= STRIPE_SECTORS;
+		*skipped = 1;
+		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1684,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
 	}
+	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
 	spin_lock(&sh->lock);
 	set_bit(STRIPE_SYNCING, &sh->state);
 	clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1717,6 +1787,13 @@ static void raid6d (mddev_t *mddev)
 	while (1) {
 		struct list_head *first;
 
+		if (conf->seq_flush - conf->seq_write > 0) {
+			int seq = conf->seq_flush;
+			bitmap_unplug(mddev->bitmap);
+			conf->seq_write = seq;
+			activate_bit_delay(conf);
+		}
+
 		if (list_empty(&conf->handle_list) &&
 		    atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
 		    !blk_queue_plugged(mddev->queue) &&
@@ -1750,7 +1827,7 @@ static void raid6d (mddev_t *mddev)
 	PRINTK("--- raid6d inactive\n");
 }
 
-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
 {
 	raid6_conf_t *conf;
 	int raid_disk, memory;
@@ -1780,6 +1857,7 @@ static int run (mddev_t *mddev)
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
+	INIT_LIST_HEAD(&conf->bitmap_list);
 	INIT_LIST_HEAD(&conf->inactive_list);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
@@ -1899,6 +1977,9 @@ static int run (mddev_t *mddev)
 	/* Ok, everything is just fine now */
 	mddev->array_size = mddev->size * (mddev->raid_disks - 2);
 
+	if (mddev->bitmap)
+		mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
 	mddev->queue->unplug_fn = raid6_unplug_device;
 	mddev->queue->issue_flush_fn = raid6_issue_flush;
 	return 0;
@@ -2076,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rdev->in_sync = 0;
 			rdev->raid_disk = disk;
 			found = 1;
+			if (rdev->saved_raid_disk != disk)
+				conf->fullsync = 1;
 			p->rdev = rdev;
 			break;
 		}
@@ -2105,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
+static void raid6_quiesce(mddev_t *mddev, int state)
+{
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+
+	switch(state) {
+	case 1: /* stop all writes */
+		spin_lock_irq(&conf->device_lock);
+		conf->quiesce = 1;
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    atomic_read(&conf->active_stripes) == 0,
+				    conf->device_lock, /* nothing */);
+		spin_unlock_irq(&conf->device_lock);
+		break;
+
+	case 0: /* re-enable writes */
+		spin_lock_irq(&conf->device_lock);
+		conf->quiesce = 0;
+		wake_up(&conf->wait_for_stripe);
+		spin_unlock_irq(&conf->device_lock);
+		break;
+	}
+	if (mddev->thread) {
+		if (mddev->bitmap)
+			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+		else
+			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+		md_wakeup_thread(mddev->thread);
+	}
+}
 static mdk_personality_t raid6_personality=
 {
 	.name		= "raid6",
@@ -2119,6 +2231,7 @@ static mdk_personality_t raid6_personality=
 	.spare_active	= raid6_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid6_resize,
+	.quiesce	= raid6_quiesce,
 };
 
 static int __init raid6_init (void)