aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-09-09 19:23:54 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 19:39:12 -0400
commit72626685dc66d455742a7f215a0535c551628b9e (patch)
tree91e19a61a5a3b782007132b6b2e353e8936dd656 /drivers
parent0002b2718dd04da67c21f8a7830de8d95a9b0345 (diff)
[PATCH] md: add write-intent-bitmap support to raid5
The most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. If the raid thread notices that this has changed, it flushes bitmap changes and assigns the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush, and an extra flag for stripes to record if the write was 'degraded' and so should not clear the bit in the bitmap. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/md.c3
-rw-r--r--drivers/md/raid5.c133
2 files changed, 124 insertions, 12 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index be7873c61b3c..dbf540a7fccc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
645 645
646 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 646 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
647 mddev->bitmap_file == NULL) { 647 mddev->bitmap_file == NULL) {
648 if (mddev->level != 1) { 648 if (mddev->level != 1 && mddev->level != 5) {
649 /* FIXME use a better test */ 649 /* FIXME use a better test */
650 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 650 printk(KERN_WARNING "md: bitmaps only support for raid1\n");
651 return -EINVAL; 651 return -EINVAL;
@@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
3517 */ 3517 */
3518void md_write_start(mddev_t *mddev, struct bio *bi) 3518void md_write_start(mddev_t *mddev, struct bio *bi)
3519{ 3519{
3520 DEFINE_WAIT(w);
3521 if (bio_data_dir(bi) != WRITE) 3520 if (bio_data_dir(bi) != WRITE)
3522 return; 3521 return;
3523 3522
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed859e08d600..4683ca24c046 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,6 +24,8 @@
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <asm/atomic.h> 25#include <asm/atomic.h>
26 26
27#include <linux/raid/bitmap.h>
28
27/* 29/*
28 * Stripe cache 30 * Stripe cache
29 */ 31 */
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
79 if (test_bit(STRIPE_HANDLE, &sh->state)) { 81 if (test_bit(STRIPE_HANDLE, &sh->state)) {
80 if (test_bit(STRIPE_DELAYED, &sh->state)) 82 if (test_bit(STRIPE_DELAYED, &sh->state))
81 list_add_tail(&sh->lru, &conf->delayed_list); 83 list_add_tail(&sh->lru, &conf->delayed_list);
82 else 84 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
85 conf->seq_write == sh->bm_seq)
86 list_add_tail(&sh->lru, &conf->bitmap_list);
87 else {
88 clear_bit(STRIPE_BIT_DELAY, &sh->state);
83 list_add_tail(&sh->lru, &conf->handle_list); 89 list_add_tail(&sh->lru, &conf->handle_list);
90 }
84 md_wakeup_thread(conf->mddev->thread); 91 md_wakeup_thread(conf->mddev->thread);
85 } else { 92 } else {
86 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 93 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
244 spin_lock_irq(&conf->device_lock); 251 spin_lock_irq(&conf->device_lock);
245 252
246 do { 253 do {
254 wait_event_lock_irq(conf->wait_for_stripe,
255 conf->quiesce == 0,
256 conf->device_lock, /* nothing */);
247 sh = __find_stripe(conf, sector); 257 sh = __find_stripe(conf, sector);
248 if (!sh) { 258 if (!sh) {
249 if (!conf->inactive_blocked) 259 if (!conf->inactive_blocked)
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
803{ 813{
804 struct bio **bip; 814 struct bio **bip;
805 raid5_conf_t *conf = sh->raid_conf; 815 raid5_conf_t *conf = sh->raid_conf;
816 int firstwrite=0;
806 817
807 PRINTK("adding bh b#%llu to stripe s#%llu\n", 818 PRINTK("adding bh b#%llu to stripe s#%llu\n",
808 (unsigned long long)bi->bi_sector, 819 (unsigned long long)bi->bi_sector,
@@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
811 822
812 spin_lock(&sh->lock); 823 spin_lock(&sh->lock);
813 spin_lock_irq(&conf->device_lock); 824 spin_lock_irq(&conf->device_lock);
814 if (forwrite) 825 if (forwrite) {
815 bip = &sh->dev[dd_idx].towrite; 826 bip = &sh->dev[dd_idx].towrite;
816 else 827 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
828 firstwrite = 1;
829 } else
817 bip = &sh->dev[dd_idx].toread; 830 bip = &sh->dev[dd_idx].toread;
818 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 831 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
819 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 832 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
836 (unsigned long long)bi->bi_sector, 849 (unsigned long long)bi->bi_sector,
837 (unsigned long long)sh->sector, dd_idx); 850 (unsigned long long)sh->sector, dd_idx);
838 851
852 if (conf->mddev->bitmap && firstwrite) {
853 sh->bm_seq = conf->seq_write;
854 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
855 STRIPE_SECTORS, 0);
856 set_bit(STRIPE_BIT_DELAY, &sh->state);
857 }
858
839 if (forwrite) { 859 if (forwrite) {
840 /* check if page is covered */ 860 /* check if page is covered */
841 sector_t sector = sh->dev[dd_idx].sector; 861 sector_t sector = sh->dev[dd_idx].sector;
@@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
958 * need to be failed 978 * need to be failed
959 */ 979 */
960 if (failed > 1 && to_read+to_write+written) { 980 if (failed > 1 && to_read+to_write+written) {
961 spin_lock_irq(&conf->device_lock);
962 for (i=disks; i--; ) { 981 for (i=disks; i--; ) {
982 int bitmap_end = 0;
983 spin_lock_irq(&conf->device_lock);
963 /* fail all writes first */ 984 /* fail all writes first */
964 bi = sh->dev[i].towrite; 985 bi = sh->dev[i].towrite;
965 sh->dev[i].towrite = NULL; 986 sh->dev[i].towrite = NULL;
966 if (bi) to_write--; 987 if (bi) { to_write--; bitmap_end = 1; }
967 988
968 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 989 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
969 wake_up(&conf->wait_for_overlap); 990 wake_up(&conf->wait_for_overlap);
@@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
981 /* and fail all 'written' */ 1002 /* and fail all 'written' */
982 bi = sh->dev[i].written; 1003 bi = sh->dev[i].written;
983 sh->dev[i].written = NULL; 1004 sh->dev[i].written = NULL;
1005 if (bi) bitmap_end = 1;
984 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { 1006 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
985 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 1007 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
986 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1008 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
1009 bi = nextbi; 1031 bi = nextbi;
1010 } 1032 }
1011 } 1033 }
1034 spin_unlock_irq(&conf->device_lock);
1035 if (bitmap_end)
1036 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1037 STRIPE_SECTORS, 0, 0);
1012 } 1038 }
1013 spin_unlock_irq(&conf->device_lock);
1014 } 1039 }
1015 if (failed > 1 && syncing) { 1040 if (failed > 1 && syncing) {
1016 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 1041 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
1038 test_bit(R5_UPTODATE, &dev->flags) ) { 1063 test_bit(R5_UPTODATE, &dev->flags) ) {
1039 /* We can return any write requests */ 1064 /* We can return any write requests */
1040 struct bio *wbi, *wbi2; 1065 struct bio *wbi, *wbi2;
1066 int bitmap_end = 0;
1041 PRINTK("Return write for disc %d\n", i); 1067 PRINTK("Return write for disc %d\n", i);
1042 spin_lock_irq(&conf->device_lock); 1068 spin_lock_irq(&conf->device_lock);
1043 wbi = dev->written; 1069 wbi = dev->written;
@@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
1051 } 1077 }
1052 wbi = wbi2; 1078 wbi = wbi2;
1053 } 1079 }
1080 if (dev->towrite == NULL)
1081 bitmap_end = 1;
1054 spin_unlock_irq(&conf->device_lock); 1082 spin_unlock_irq(&conf->device_lock);
1083 if (bitmap_end)
1084 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1085 STRIPE_SECTORS,
1086 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1055 } 1087 }
1056 } 1088 }
1057 } 1089 }
@@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
1175 } 1207 }
1176 } 1208 }
1177 /* now if nothing is locked, and if we have enough data, we can start a write request */ 1209 /* now if nothing is locked, and if we have enough data, we can start a write request */
1178 if (locked == 0 && (rcw == 0 ||rmw == 0)) { 1210 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1211 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1179 PRINTK("Computing parity...\n"); 1212 PRINTK("Computing parity...\n");
1180 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1213 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1181 /* now every locked buffer is ready to be written */ 1214 /* now every locked buffer is ready to be written */
@@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
1231 dev = &sh->dev[failed_num]; 1264 dev = &sh->dev[failed_num];
1232 set_bit(R5_LOCKED, &dev->flags); 1265 set_bit(R5_LOCKED, &dev->flags);
1233 set_bit(R5_Wantwrite, &dev->flags); 1266 set_bit(R5_Wantwrite, &dev->flags);
1267 clear_bit(STRIPE_DEGRADED, &sh->state);
1234 locked++; 1268 locked++;
1235 set_bit(STRIPE_INSYNC, &sh->state); 1269 set_bit(STRIPE_INSYNC, &sh->state);
1236 set_bit(R5_Syncio, &dev->flags); 1270 set_bit(R5_Syncio, &dev->flags);
@@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
1298 bi->bi_next = NULL; 1332 bi->bi_next = NULL;
1299 generic_make_request(bi); 1333 generic_make_request(bi);
1300 } else { 1334 } else {
1335 if (rw == 1)
1336 set_bit(STRIPE_DEGRADED, &sh->state);
1301 PRINTK("skip op %ld on disc %d for sector %llu\n", 1337 PRINTK("skip op %ld on disc %d for sector %llu\n",
1302 bi->bi_rw, i, (unsigned long long)sh->sector); 1338 bi->bi_rw, i, (unsigned long long)sh->sector);
1303 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1339 clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
1322 } 1358 }
1323} 1359}
1324 1360
1361static inline void activate_bit_delay(raid5_conf_t *conf)
1362{
1363 /* device_lock is held */
1364 struct list_head head;
1365 list_add(&head, &conf->bitmap_list);
1366 list_del_init(&conf->bitmap_list);
1367 while (!list_empty(&head)) {
1368 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1369 list_del_init(&sh->lru);
1370 atomic_inc(&sh->count);
1371 __release_stripe(conf, sh);
1372 }
1373}
1374
1325static void unplug_slaves(mddev_t *mddev) 1375static void unplug_slaves(mddev_t *mddev)
1326{ 1376{
1327 raid5_conf_t *conf = mddev_to_conf(mddev); 1377 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)
1354 1404
1355 spin_lock_irqsave(&conf->device_lock, flags); 1405 spin_lock_irqsave(&conf->device_lock, flags);
1356 1406
1357 if (blk_remove_plug(q)) 1407 if (blk_remove_plug(q)) {
1408 conf->seq_flush++;
1358 raid5_activate_delayed(conf); 1409 raid5_activate_delayed(conf);
1410 }
1359 md_wakeup_thread(mddev->thread); 1411 md_wakeup_thread(mddev->thread);
1360 1412
1361 spin_unlock_irqrestore(&conf->device_lock, flags); 1413 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1493 sector_t first_sector; 1545 sector_t first_sector;
1494 int raid_disks = conf->raid_disks; 1546 int raid_disks = conf->raid_disks;
1495 int data_disks = raid_disks-1; 1547 int data_disks = raid_disks-1;
1548 sector_t max_sector = mddev->size << 1;
1549 int sync_blocks;
1496 1550
1497 if (sector_nr >= mddev->size <<1) { 1551 if (sector_nr >= max_sector) {
1498 /* just being told to finish up .. nothing much to do */ 1552 /* just being told to finish up .. nothing much to do */
1499 unplug_slaves(mddev); 1553 unplug_slaves(mddev);
1554
1555 if (mddev->curr_resync < max_sector) /* aborted */
1556 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1557 &sync_blocks, 1);
1558 else /* compelted sync */
1559 conf->fullsync = 0;
1560 bitmap_close_sync(mddev->bitmap);
1561
1500 return 0; 1562 return 0;
1501 } 1563 }
1502 /* if there is 1 or more failed drives and we are trying 1564 /* if there is 1 or more failed drives and we are trying
@@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1508 *skipped = 1; 1570 *skipped = 1;
1509 return rv; 1571 return rv;
1510 } 1572 }
1573 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1574 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1575 /* we can skip this block, and probably more */
1576 sync_blocks /= STRIPE_SECTORS;
1577 *skipped = 1;
1578 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1579 }
1511 1580
1512 x = sector_nr; 1581 x = sector_nr;
1513 chunk_offset = sector_div(x, sectors_per_chunk); 1582 chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1525 set_current_state(TASK_UNINTERRUPTIBLE); 1594 set_current_state(TASK_UNINTERRUPTIBLE);
1526 schedule_timeout(1); 1595 schedule_timeout(1);
1527 } 1596 }
1597 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
1528 spin_lock(&sh->lock); 1598 spin_lock(&sh->lock);
1529 set_bit(STRIPE_SYNCING, &sh->state); 1599 set_bit(STRIPE_SYNCING, &sh->state);
1530 clear_bit(STRIPE_INSYNC, &sh->state); 1600 clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
1558 while (1) { 1628 while (1) {
1559 struct list_head *first; 1629 struct list_head *first;
1560 1630
1631 if (conf->seq_flush - conf->seq_write > 0) {
1632 int seq = conf->seq_flush;
1633 bitmap_unplug(mddev->bitmap);
1634 conf->seq_write = seq;
1635 activate_bit_delay(conf);
1636 }
1637
1561 if (list_empty(&conf->handle_list) && 1638 if (list_empty(&conf->handle_list) &&
1562 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && 1639 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1563 !blk_queue_plugged(mddev->queue) && 1640 !blk_queue_plugged(mddev->queue) &&
@@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
1591 PRINTK("--- raid5d inactive\n"); 1668 PRINTK("--- raid5d inactive\n");
1592} 1669}
1593 1670
1594static int run (mddev_t *mddev) 1671static int run(mddev_t *mddev)
1595{ 1672{
1596 raid5_conf_t *conf; 1673 raid5_conf_t *conf;
1597 int raid_disk, memory; 1674 int raid_disk, memory;
@@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev)
1621 init_waitqueue_head(&conf->wait_for_overlap); 1698 init_waitqueue_head(&conf->wait_for_overlap);
1622 INIT_LIST_HEAD(&conf->handle_list); 1699 INIT_LIST_HEAD(&conf->handle_list);
1623 INIT_LIST_HEAD(&conf->delayed_list); 1700 INIT_LIST_HEAD(&conf->delayed_list);
1701 INIT_LIST_HEAD(&conf->bitmap_list);
1624 INIT_LIST_HEAD(&conf->inactive_list); 1702 INIT_LIST_HEAD(&conf->inactive_list);
1625 atomic_set(&conf->active_stripes, 0); 1703 atomic_set(&conf->active_stripes, 0);
1626 atomic_set(&conf->preread_active_stripes, 0); 1704 atomic_set(&conf->preread_active_stripes, 0);
@@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1732 1810
1733 /* Ok, everything is just fine now */ 1811 /* Ok, everything is just fine now */
1734 1812
1813 if (mddev->bitmap)
1814 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1815
1735 mddev->queue->unplug_fn = raid5_unplug_device; 1816 mddev->queue->unplug_fn = raid5_unplug_device;
1736 mddev->queue->issue_flush_fn = raid5_issue_flush; 1817 mddev->queue->issue_flush_fn = raid5_issue_flush;
1737 1818
@@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1912 rdev->in_sync = 0; 1993 rdev->in_sync = 0;
1913 rdev->raid_disk = disk; 1994 rdev->raid_disk = disk;
1914 found = 1; 1995 found = 1;
1996 if (rdev->saved_raid_disk != disk)
1997 conf->fullsync = 1;
1915 p->rdev = rdev; 1998 p->rdev = rdev;
1916 break; 1999 break;
1917 } 2000 }
@@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
1941 return 0; 2024 return 0;
1942} 2025}
1943 2026
2027static void raid5_quiesce(mddev_t *mddev, int state)
2028{
2029 raid5_conf_t *conf = mddev_to_conf(mddev);
2030
2031 switch(state) {
2032 case 1: /* stop all writes */
2033 spin_lock_irq(&conf->device_lock);
2034 conf->quiesce = 1;
2035 wait_event_lock_irq(conf->wait_for_stripe,
2036 atomic_read(&conf->active_stripes) == 0,
2037 conf->device_lock, /* nothing */);
2038 spin_unlock_irq(&conf->device_lock);
2039 break;
2040
2041 case 0: /* re-enable writes */
2042 spin_lock_irq(&conf->device_lock);
2043 conf->quiesce = 0;
2044 wake_up(&conf->wait_for_stripe);
2045 spin_unlock_irq(&conf->device_lock);
2046 break;
2047 }
2048 if (mddev->thread) {
2049 if (mddev->bitmap)
2050 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2051 else
2052 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2053 md_wakeup_thread(mddev->thread);
2054 }
2055}
1944static mdk_personality_t raid5_personality= 2056static mdk_personality_t raid5_personality=
1945{ 2057{
1946 .name = "raid5", 2058 .name = "raid5",
@@ -1955,6 +2067,7 @@ static mdk_personality_t raid5_personality=
1955 .spare_active = raid5_spare_active, 2067 .spare_active = raid5_spare_active,
1956 .sync_request = sync_request, 2068 .sync_request = sync_request,
1957 .resize = raid5_resize, 2069 .resize = raid5_resize,
2070 .quiesce = raid5_quiesce,
1958}; 2071};
1959 2072
1960static int __init raid5_init (void) 2073static int __init raid5_init (void)