aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2006-01-06 03:20:16 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:34:03 -0500
commit: 6cce3b23f6f8e974c00af7a9b88f1d413ba368a8 (patch)
tree: 8c156f26a10de82b1626d74f810704cfd6f5ba2b /drivers/md/raid10.c
parent: b15c2e57f0f5bf596a19e9c5571e5b07cdfc7363 (diff)
[PATCH] md: write intent bitmap support for raid10
Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- drivers/md/raid10.c 178
1 files changed, 157 insertions, 21 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 50bd7b152f28..8f58a447d9f0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include "dm-bio-list.h"
21#include <linux/raid/raid10.h> 22#include <linux/raid/raid10.h>
23#include <linux/raid/bitmap.h>
22 24
23/* 25/*
24 * RAID10 provides a combination of RAID0 and RAID1 functionality. 26 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
306 /* 308 /*
307 * this branch is our 'one mirror IO has finished' event handler: 309 * this branch is our 'one mirror IO has finished' event handler:
308 */ 310 */
309 if (!uptodate) 311 if (!uptodate) {
310 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 312 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
311 else 313 /* an I/O failed, we can't clear the bitmap */
314 set_bit(R10BIO_Degraded, &r10_bio->state);
315 } else
312 /* 316 /*
313 * Set R10BIO_Uptodate in our master bio, so that 317 * Set R10BIO_Uptodate in our master bio, so that
314 * we will return a good error code to the higher 318 * we will return a good error code to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
328 * already. 332 * already.
329 */ 333 */
330 if (atomic_dec_and_test(&r10_bio->remaining)) { 334 if (atomic_dec_and_test(&r10_bio->remaining)) {
335 /* clear the bitmap if all writes complete successfully */
336 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
337 r10_bio->sectors,
338 !test_bit(R10BIO_Degraded, &r10_bio->state),
339 0);
331 md_write_end(r10_bio->mddev); 340 md_write_end(r10_bio->mddev);
332 raid_end_bio_io(r10_bio); 341 raid_end_bio_io(r10_bio);
333 } 342 }
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
486 rcu_read_lock(); 495 rcu_read_lock();
487 /* 496 /*
488 * Check if we can balance. We can balance on the whole 497 * Check if we can balance. We can balance on the whole
489 * device if no resync is going on, or below the resync window. 498 * device if no resync is going on (recovery is ok), or below
490 * We take the first readable disk when above the resync window. 499 * the resync window. We take the first readable disk when
500 * above the resync window.
491 */ 501 */
492 if (conf->mddev->recovery_cp < MaxSector 502 if (conf->mddev->recovery_cp < MaxSector
493 && (this_sector + sectors >= conf->next_resync)) { 503 && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
591 601
592static void raid10_unplug(request_queue_t *q) 602static void raid10_unplug(request_queue_t *q)
593{ 603{
604 mddev_t *mddev = q->queuedata;
605
594 unplug_slaves(q->queuedata); 606 unplug_slaves(q->queuedata);
607 md_wakeup_thread(mddev->thread);
595} 608}
596 609
597static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, 610static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
647 */ 660 */
648#define RESYNC_DEPTH 32 661#define RESYNC_DEPTH 32
649 662
650static void raise_barrier(conf_t *conf) 663static void raise_barrier(conf_t *conf, int force)
651{ 664{
665 BUG_ON(force && !conf->barrier);
652 spin_lock_irq(&conf->resync_lock); 666 spin_lock_irq(&conf->resync_lock);
653 667
654 /* Wait until no block IO is waiting */ 668 /* Wait until no block IO is waiting (unless 'force') */
655 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 669 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
656 conf->resync_lock, 670 conf->resync_lock,
657 raid10_unplug(conf->mddev->queue)); 671 raid10_unplug(conf->mddev->queue));
658 672
@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
710 int i; 724 int i;
711 int chunk_sects = conf->chunk_mask + 1; 725 int chunk_sects = conf->chunk_mask + 1;
712 const int rw = bio_data_dir(bio); 726 const int rw = bio_data_dir(bio);
727 struct bio_list bl;
728 unsigned long flags;
713 729
714 if (unlikely(bio_barrier(bio))) { 730 if (unlikely(bio_barrier(bio))) {
715 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 731 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
767 783
768 r10_bio->mddev = mddev; 784 r10_bio->mddev = mddev;
769 r10_bio->sector = bio->bi_sector; 785 r10_bio->sector = bio->bi_sector;
786 r10_bio->state = 0;
770 787
771 if (rw == READ) { 788 if (rw == READ) {
772 /* 789 /*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
811 !test_bit(Faulty, &rdev->flags)) { 828 !test_bit(Faulty, &rdev->flags)) {
812 atomic_inc(&rdev->nr_pending); 829 atomic_inc(&rdev->nr_pending);
813 r10_bio->devs[i].bio = bio; 830 r10_bio->devs[i].bio = bio;
814 } else 831 } else {
815 r10_bio->devs[i].bio = NULL; 832 r10_bio->devs[i].bio = NULL;
833 set_bit(R10BIO_Degraded, &r10_bio->state);
834 }
816 } 835 }
817 rcu_read_unlock(); 836 rcu_read_unlock();
818 837
819 atomic_set(&r10_bio->remaining, 1); 838 atomic_set(&r10_bio->remaining, 0);
820 839
840 bio_list_init(&bl);
821 for (i = 0; i < conf->copies; i++) { 841 for (i = 0; i < conf->copies; i++) {
822 struct bio *mbio; 842 struct bio *mbio;
823 int d = r10_bio->devs[i].devnum; 843 int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
835 mbio->bi_private = r10_bio; 855 mbio->bi_private = r10_bio;
836 856
837 atomic_inc(&r10_bio->remaining); 857 atomic_inc(&r10_bio->remaining);
838 generic_make_request(mbio); 858 bio_list_add(&bl, mbio);
839 } 859 }
840 860
841 if (atomic_dec_and_test(&r10_bio->remaining)) { 861 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
842 md_write_end(mddev); 862 spin_lock_irqsave(&conf->device_lock, flags);
843 raid_end_bio_io(r10_bio); 863 bio_list_merge(&conf->pending_bio_list, &bl);
844 } 864 blk_plug_device(mddev->queue);
865 spin_unlock_irqrestore(&conf->device_lock, flags);
845 866
846 return 0; 867 return 0;
847} 868}
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
999 if (!enough(conf)) 1020 if (!enough(conf))
1000 return 0; 1021 return 0;
1001 1022
1002 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1023 if (rdev->saved_raid_disk >= 0 &&
1024 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1025 mirror = rdev->saved_raid_disk;
1026 else
1027 mirror = 0;
1028 for ( ; mirror < mddev->raid_disks; mirror++)
1003 if ( !(p=conf->mirrors+mirror)->rdev) { 1029 if ( !(p=conf->mirrors+mirror)->rdev) {
1004 1030
1005 blk_queue_stack_limits(mddev->queue, 1031 blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1015 p->head_position = 0; 1041 p->head_position = 0;
1016 rdev->raid_disk = mirror; 1042 rdev->raid_disk = mirror;
1017 found = 1; 1043 found = 1;
1044 if (rdev->saved_raid_disk != mirror)
1045 conf->fullsync = 1;
1018 rcu_assign_pointer(p->rdev, rdev); 1046 rcu_assign_pointer(p->rdev, rdev);
1019 break; 1047 break;
1020 } 1048 }
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
1282 for (;;) { 1310 for (;;) {
1283 char b[BDEVNAME_SIZE]; 1311 char b[BDEVNAME_SIZE];
1284 spin_lock_irqsave(&conf->device_lock, flags); 1312 spin_lock_irqsave(&conf->device_lock, flags);
1313
1314 if (conf->pending_bio_list.head) {
1315 bio = bio_list_get(&conf->pending_bio_list);
1316 blk_remove_plug(mddev->queue);
1317 spin_unlock_irqrestore(&conf->device_lock, flags);
1318 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1319 if (bitmap_unplug(mddev->bitmap) != 0)
1320 printk("%s: bitmap file write failed!\n", mdname(mddev));
1321
1322 while (bio) { /* submit pending writes */
1323 struct bio *next = bio->bi_next;
1324 bio->bi_next = NULL;
1325 generic_make_request(bio);
1326 bio = next;
1327 }
1328 unplug = 1;
1329
1330 continue;
1331 }
1332
1285 if (list_empty(head)) 1333 if (list_empty(head))
1286 break; 1334 break;
1287 r10_bio = list_entry(head->prev, r10bio_t, retry_list); 1335 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1388 sector_t max_sector, nr_sectors; 1436 sector_t max_sector, nr_sectors;
1389 int disk; 1437 int disk;
1390 int i; 1438 int i;
1439 int max_sync;
1440 int sync_blocks;
1391 1441
1392 sector_t sectors_skipped = 0; 1442 sector_t sectors_skipped = 0;
1393 int chunks_skipped = 0; 1443 int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1401 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 1451 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1402 max_sector = mddev->resync_max_sectors; 1452 max_sector = mddev->resync_max_sectors;
1403 if (sector_nr >= max_sector) { 1453 if (sector_nr >= max_sector) {
1454 /* If we aborted, we need to abort the
1455 * sync on the 'current' bitmap chunks (there can
1456 * be several when recovering multiple devices).
1457 * as we may have started syncing it but not finished.
1458 * We can find the current address in
1459 * mddev->curr_resync, but for recovery,
1460 * we need to convert that to several
1461 * virtual addresses.
1462 */
1463 if (mddev->curr_resync < max_sector) { /* aborted */
1464 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1465 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1466 &sync_blocks, 1);
1467 else for (i=0; i<conf->raid_disks; i++) {
1468 sector_t sect =
1469 raid10_find_virt(conf, mddev->curr_resync, i);
1470 bitmap_end_sync(mddev->bitmap, sect,
1471 &sync_blocks, 1);
1472 }
1473 } else /* completed sync */
1474 conf->fullsync = 0;
1475
1476 bitmap_close_sync(mddev->bitmap);
1404 close_sync(conf); 1477 close_sync(conf);
1405 *skipped = 1; 1478 *skipped = 1;
1406 return sectors_skipped; 1479 return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1425 */ 1498 */
1426 if (!go_faster && conf->nr_waiting) 1499 if (!go_faster && conf->nr_waiting)
1427 msleep_interruptible(1000); 1500 msleep_interruptible(1000);
1428 raise_barrier(conf);
1429 conf->next_resync = sector_nr;
1430 1501
1431 /* Again, very different code for resync and recovery. 1502 /* Again, very different code for resync and recovery.
1432 * Both must result in an r10bio with a list of bios that 1503 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1443 * end_sync_write if we will want to write. 1514 * end_sync_write if we will want to write.
1444 */ 1515 */
1445 1516
1517 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1446 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1518 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1447 /* recovery... the complicated one */ 1519 /* recovery... the complicated one */
1448 int i, j, k; 1520 int i, j, k;
@@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1451 for (i=0 ; i<conf->raid_disks; i++) 1523 for (i=0 ; i<conf->raid_disks; i++)
1452 if (conf->mirrors[i].rdev && 1524 if (conf->mirrors[i].rdev &&
1453 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { 1525 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1526 int still_degraded = 0;
1454 /* want to reconstruct this device */ 1527 /* want to reconstruct this device */
1455 r10bio_t *rb2 = r10_bio; 1528 r10bio_t *rb2 = r10_bio;
1529 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1530 int must_sync;
1531 /* Unless we are doing a full sync, we only need
1532 * to recover the block if it is set in the bitmap
1533 */
1534 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1535 &sync_blocks, 1);
1536 if (sync_blocks < max_sync)
1537 max_sync = sync_blocks;
1538 if (!must_sync &&
1539 !conf->fullsync) {
1540 /* yep, skip the sync_blocks here, but don't assume
1541 * that there will never be anything to do here
1542 */
1543 chunks_skipped = -1;
1544 continue;
1545 }
1456 1546
1457 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1547 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1458 spin_lock_irq(&conf->resync_lock); 1548 raise_barrier(conf, rb2 != NULL);
1459 if (rb2) conf->barrier++;
1460 spin_unlock_irq(&conf->resync_lock);
1461 atomic_set(&r10_bio->remaining, 0); 1549 atomic_set(&r10_bio->remaining, 0);
1462 1550
1463 r10_bio->master_bio = (struct bio*)rb2; 1551 r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1465 atomic_inc(&rb2->remaining); 1553 atomic_inc(&rb2->remaining);
1466 r10_bio->mddev = mddev; 1554 r10_bio->mddev = mddev;
1467 set_bit(R10BIO_IsRecover, &r10_bio->state); 1555 set_bit(R10BIO_IsRecover, &r10_bio->state);
1468 r10_bio->sector = raid10_find_virt(conf, sector_nr, i); 1556 r10_bio->sector = sect;
1557
1469 raid10_find_phys(conf, r10_bio); 1558 raid10_find_phys(conf, r10_bio);
1559 /* Need to check if this section will still be
1560 * degraded
1561 */
1562 for (j=0; j<conf->copies;j++) {
1563 int d = r10_bio->devs[j].devnum;
1564 if (conf->mirrors[d].rdev == NULL ||
1565 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1566 still_degraded = 1;
1567 }
1568 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1569 &sync_blocks, still_degraded);
1570
1470 for (j=0; j<conf->copies;j++) { 1571 for (j=0; j<conf->copies;j++) {
1471 int d = r10_bio->devs[j].devnum; 1572 int d = r10_bio->devs[j].devnum;
1472 if (conf->mirrors[d].rdev && 1573 if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1526 } else { 1627 } else {
1527 /* resync. Schedule a read for every block at this virt offset */ 1628 /* resync. Schedule a read for every block at this virt offset */
1528 int count = 0; 1629 int count = 0;
1630
1631 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1632 &sync_blocks, mddev->degraded) &&
1633 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1634 /* We can skip this block */
1635 *skipped = 1;
1636 return sync_blocks + sectors_skipped;
1637 }
1638 if (sync_blocks < max_sync)
1639 max_sync = sync_blocks;
1529 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1640 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1530 1641
1531 r10_bio->mddev = mddev; 1642 r10_bio->mddev = mddev;
1532 atomic_set(&r10_bio->remaining, 0); 1643 atomic_set(&r10_bio->remaining, 0);
1644 raise_barrier(conf, 0);
1645 conf->next_resync = sector_nr;
1533 1646
1534 r10_bio->master_bio = NULL; 1647 r10_bio->master_bio = NULL;
1535 r10_bio->sector = sector_nr; 1648 r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1582 } 1695 }
1583 1696
1584 nr_sectors = 0; 1697 nr_sectors = 0;
1698 if (sector_nr + max_sync < max_sector)
1699 max_sector = sector_nr + max_sync;
1585 do { 1700 do {
1586 struct page *page; 1701 struct page *page;
1587 int len = PAGE_SIZE; 1702 int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
1821 return 0; 1936 return 0;
1822} 1937}
1823 1938
1939static void raid10_quiesce(mddev_t *mddev, int state)
1940{
1941 conf_t *conf = mddev_to_conf(mddev);
1942
1943 switch(state) {
1944 case 1:
1945 raise_barrier(conf, 0);
1946 break;
1947 case 0:
1948 lower_barrier(conf);
1949 break;
1950 }
1951 if (mddev->thread) {
1952 if (mddev->bitmap)
1953 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1954 else
1955 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1956 md_wakeup_thread(mddev->thread);
1957 }
1958}
1824 1959
1825static mdk_personality_t raid10_personality = 1960static mdk_personality_t raid10_personality =
1826{ 1961{
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
1835 .hot_remove_disk= raid10_remove_disk, 1970 .hot_remove_disk= raid10_remove_disk,
1836 .spare_active = raid10_spare_active, 1971 .spare_active = raid10_spare_active,
1837 .sync_request = sync_request, 1972 .sync_request = sync_request,
1973 .quiesce = raid10_quiesce,
1838}; 1974};
1839 1975
1840static int __init raid_init(void) 1976static int __init raid_init(void)