author    NeilBrown <neilb@suse.de>                 2006-03-27 04:18:09 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>    2006-03-27 11:45:01 -0500
commit    ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e (patch)
tree      0c86fe6ba0f0bafd76f86f81b7425d6d497664aa /drivers/md
parent    7ecaa1e6a1ad69862e9980b6c777e11f26c4782d (diff)
[PATCH] md: Core of raid5 resize process
This patch provides the core of the resize/expand process.

sync_request notices if a 'reshape' is happening and acts accordingly.
It allocates new stripe_heads for the next chunk-wide stripe in the
target geometry, marking them STRIPE_EXPANDING.

Then it finds which stripe_heads in the old geometry can provide data
needed by these, and marks them STRIPE_EXPAND_SOURCE. This causes
handle_stripe to read all blocks on those stripes.

Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any
that are needed are copied into the corresponding STRIPE_EXPANDING
stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marked
STRIPE_EXPAND_READY, and is then written out and released.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
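
[Editor's note] In outline, a stripe's journey through a reshape looks like
the following minimal user-space sketch. The STRIPE_* flag names come from
this patch, but the bit values and the simulation itself are illustrative,
not kernel code:

/* Minimal sketch of the reshape state flow (flag names are from the
 * patch; the bit positions and this simulation are illustrative only). */
#include <stdio.h>

#define STRIPE_EXPANDING     (1UL << 0) /* destination stripe, new geometry */
#define STRIPE_EXPAND_SOURCE (1UL << 1) /* old-geometry stripe to read */
#define STRIPE_EXPAND_READY  (1UL << 2) /* destination full, write it out */

int main(void)
{
	unsigned long src = 0, dst = 0;

	/* sync_request: tag one chunk worth of destination stripes ... */
	dst |= STRIPE_EXPANDING;
	/* ... and every old-geometry stripe that feeds them */
	src |= STRIPE_EXPAND_SOURCE;	/* handle_stripe now reads all blocks */

	/* handle_stripe on the source, once all reads complete:
	 * copy the needed blocks across, then drop the source flag */
	src &= ~STRIPE_EXPAND_SOURCE;
	dst |= STRIPE_EXPAND_READY;	/* destination has all its data */

	/* handle_stripe on the destination: compute parity, write out */
	dst &= ~(STRIPE_EXPAND_READY | STRIPE_EXPANDING);

	printf("src=%#lx dst=%#lx\n", src, dst);
	return 0;
}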
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/md.c    |  14
-rw-r--r--  drivers/md/raid5.c | 185
2 files changed, 174 insertions(+), 25 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7b7656f9aa5..8e65986bc63f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2165,7 +2165,9 @@ action_show(mddev_t *mddev, char *page)
 	char *type = "idle";
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+			type = "reshape";
+		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 				type = "resync";
 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -4088,8 +4090,10 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 		seq_printf(seq, "] ");
 	}
 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+		    "reshape" :
 		   (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
-		    "resync" : "recovery"),
+		    "resync" : "recovery")),
 		   per_milli/10, per_milli % 10,
 		   (unsigned long long) resync,
 		   (unsigned long long) max_blocks);
@@ -4543,7 +4547,9 @@ static void md_do_sync(mddev_t *mddev)
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
-	} else
+	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		max_sectors = mddev->size << 1;
+	else
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
 
@@ -4679,6 +4685,8 @@ static void md_do_sync(mddev_t *mddev)
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2 &&
 	    mddev->curr_resync >= mddev->recovery_cp) {
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7a6df515b008..56cba8d3e398 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
 					md_wakeup_thread(conf->mddev->thread);
 			}
-			list_add_tail(&sh->lru, &conf->inactive_list);
 			atomic_dec(&conf->active_stripes);
-			if (!conf->inactive_blocked ||
-			    atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
+			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+				list_add_tail(&sh->lru, &conf->inactive_list);
 				wake_up(&conf->wait_for_stripe);
+			}
 		}
 	}
 }
@@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 			} else {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (list_empty(&sh->lru))
-					BUG();
-				list_del_init(&sh->lru);
+				if (!list_empty(&sh->lru))
+					list_del_init(&sh->lru);
 			}
 		}
 	} while (sh == NULL);
@@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 	return 0;
 }
 
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+{
+	int sectors_per_chunk = conf->chunk_size >> 9;
+	sector_t x = stripe;
+	int pd_idx, dd_idx;
+	int chunk_offset = sector_div(x, sectors_per_chunk);
+	stripe = x;
+	raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
+			     + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+	return pd_idx;
+}
+
 
 /*
  * handle_stripe - do things to a stripe.
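
[Editor's note] stripe_to_pdidx() answers "which disk holds parity for the
chunk-row containing this per-device sector?" by rebuilding a virtual array
sector and feeding it back through raid5_compute_sector(). For md's default
left-symmetric layout the arithmetic reduces to the sketch below; the parity
formula is an assumption for illustration, while the real helper defers to
raid5_compute_sector(), which implements all four RAID5 layouts:

/* User-space sketch of the stripe_to_pdidx() arithmetic, assuming
 * ALGORITHM_LEFT_SYMMETRIC (md's default). */
#include <stdio.h>

static int stripe_to_pdidx(unsigned long long sector, int sectors_per_chunk,
			   int disks)
{
	/* which chunk-row of each device this sector falls in */
	unsigned long long stripe = sector / sectors_per_chunk;

	/* left-symmetric: parity walks backwards one disk per row */
	return (disks - 1) - (int)(stripe % disks);
}

int main(void)
{
	/* 64KiB chunks (128 sectors), 5-disk array: parity rotates 4,3,2,1,0 */
	for (unsigned long long s = 0; s < 5ULL * 128; s += 128)
		printf("device sector %4llu -> pd_idx %d\n",
		       s, stripe_to_pdidx(s, 128, 5));
	return 0;
}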
@@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh)
 	struct bio *return_bi= NULL;
 	struct bio *bi;
 	int i;
-	int syncing;
+	int syncing, expanding, expanded;
 	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
 	int non_overwrite = 0;
 	int failed_num=0;
@@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh)
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
@@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh)
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
+	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
 		for (i=disks; i--;) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
 			    (dev->toread ||
 			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 			     syncing ||
+			     expanding ||
 			     (failed && (sh->dev[failed_num].toread ||
 					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
 			    )
@@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh)
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
+			locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
+			locked++;
 		}
 	}
 
+	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+		/* Need to write out all blocks after computing parity */
+		sh->disks = conf->raid_disks;
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
+		compute_parity(sh, RECONSTRUCT_WRITE);
+		for (i= conf->raid_disks; i--;) {
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
+			locked++;
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+		}
+		clear_bit(STRIPE_EXPANDING, &sh->state);
+	} else if (expanded) {
+		clear_bit(STRIPE_EXPAND_READY, &sh->state);
+		wake_up(&conf->wait_for_overlap);
+		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+	}
+
+	if (expanding && locked == 0) {
+		/* We have read all the blocks in this stripe and now we need to
+		 * copy some of them into a target stripe for expand.
+		 */
+		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+		for (i=0; i< sh->disks; i++)
+			if (i != sh->pd_idx) {
+				int dd_idx, pd_idx, j;
+				struct stripe_head *sh2;
+
+				sector_t bn = compute_blocknr(sh, i);
+				sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+								  conf->raid_disks-1,
+								  &dd_idx, &pd_idx, conf);
+				sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
+				if (sh2 == NULL)
+					/* so far only the early blocks of this stripe
+					 * have been requested.  When later blocks
+					 * get requested, we will try again
+					 */
+					continue;
+				if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+				    test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+					/* must have already done this block */
+					release_stripe(sh2);
+					continue;
+				}
+				memcpy(page_address(sh2->dev[dd_idx].page),
+				       page_address(sh->dev[i].page),
+				       STRIPE_SIZE);
+				set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+				for (j=0; j<conf->raid_disks; j++)
+					if (j != sh2->pd_idx &&
+					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
+						break;
+				if (j == conf->raid_disks) {
+					set_bit(STRIPE_EXPAND_READY, &sh2->state);
+					set_bit(STRIPE_HANDLE, &sh2->state);
+				}
+				release_stripe(sh2);
+			}
+	}
+
 	spin_unlock(&sh->lock);
 
 	while ((bi=return_bi)) {
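
[Editor's note] The copy loop above is a change of base: compute_blocknr()
inverts the old layout to recover each block's logical sector, and
raid5_compute_sector() re-maps that sector into the wider geometry. Ignoring
parity rotation, the relocation is plain chunk arithmetic, as in this
hypothetical sketch of a 4-disk to 5-disk grow (3 -> 4 data disks):

/* Sketch of why blocks move between stripes during a grow: the same
 * logical chunk lands on a different (device, row) once the data-disk
 * count changes. Parity rotation is ignored here for brevity. */
#include <stdio.h>

#define CHUNK 128 /* sectors per chunk */

/* logical sector -> (data-disk index, per-device chunk row) */
static void map(unsigned long long lsect, int data_disks,
		int *dd, unsigned long long *row)
{
	unsigned long long chunk = lsect / CHUNK;
	*dd  = (int)(chunk % data_disks);
	*row = chunk / data_disks;
}

int main(void)
{
	for (unsigned long long c = 0; c < 8; c++) {
		int d_old, d_new;
		unsigned long long r_old, r_new;
		map(c * CHUNK, 3, &d_old, &r_old); /* old: 4-disk raid5 */
		map(c * CHUNK, 4, &d_new, &r_new); /* new: 5-disk raid5 */
		printf("chunk %llu: disk %d row %llu -> disk %d row %llu\n",
		       c, d_old, r_old, d_new, r_new);
	}
	return 0;
}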
@@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh)
 		rcu_read_unlock();
 
 		if (rdev) {
-			if (syncing)
+			if (syncing || expanding || expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
@@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	sector_t x;
-	unsigned long stripe;
-	int chunk_offset;
-	int dd_idx, pd_idx;
-	sector_t first_sector;
+	int pd_idx;
+	sector_t first_sector, last_sector;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 	sector_t max_sector = mddev->size << 1;
@@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 
 		return 0;
 	}
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+		/* reshaping is quite different to recovery/resync so it is
+		 * handled quite separately ... here.
+		 *
+		 * On each call to sync_request, we gather one chunk worth of
+		 * destination stripes and flag them as expanding.
+		 * Then we find all the source stripes and request reads.
+		 * As the reads complete, handle_stripe will copy the data
+		 * into the destination stripe and release that stripe.
+		 */
+		int i;
+		int dd_idx;
+		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+			int j;
+			int skipped = 0;
+			pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+			sh = get_active_stripe(conf, sector_nr+i,
+					       conf->raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPANDING, &sh->state);
+			/* If any of this stripe is beyond the end of the old
+			 * array, then we need to zero those blocks
+			 */
+			for (j=sh->disks; j--;) {
+				sector_t s;
+				if (j == sh->pd_idx)
+					continue;
+				s = compute_blocknr(sh, j);
+				if (s < (mddev->array_size<<1)) {
+					skipped = 1;
+					continue;
+				}
+				memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+				set_bit(R5_Expanded, &sh->dev[j].flags);
+				set_bit(R5_UPTODATE, &sh->dev[j].flags);
+			}
+			if (!skipped) {
+				set_bit(STRIPE_EXPAND_READY, &sh->state);
+				set_bit(STRIPE_HANDLE, &sh->state);
+			}
+			release_stripe(sh);
+		}
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+		spin_unlock_irq(&conf->device_lock);
+		/* Ok, those stripes are ready. We can start scheduling
+		 * reads on the source stripes.
+		 * The source stripes are determined by mapping the first and last
+		 * block on the destination stripes.
+		 */
+		raid_disks = conf->previous_raid_disks;
+		data_disks = raid_disks - 1;
+		first_sector =
+			raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		last_sector =
+			raid5_compute_sector((sector_nr+conf->chunk_size/512)
+					     *(conf->raid_disks-1) -1,
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		if (last_sector >= (mddev->size<<1))
+			last_sector = (mddev->size<<1)-1;
+		while (first_sector <= last_sector) {
+			pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+			sh = get_active_stripe(conf, first_sector,
+					       conf->previous_raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			release_stripe(sh);
+			first_sector += STRIPE_SECTORS;
+		}
+		return conf->chunk_size>>9;
+	}
 	/* if there is 1 or more failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
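
[Editor's note] The first_sector/last_sector computation brackets exactly the
old-geometry stripes that feed one destination chunk, and expand_progress
records (sector_nr + i) * (new data disks), i.e. the array data sector up to
which destination stripes now exist. A hedged worked example, assuming 64KiB
chunks and a 4 -> 5 disk grow; dev_sector() is a hypothetical stand-in for
the per-device sector that raid5_compute_sector() returns (parity rotation
does not change that value):

/* Worked example of the source-range computation above. */
#include <stdio.h>

#define CHUNK 128 /* sectors per chunk */

static unsigned long long dev_sector(unsigned long long lsect, int data_disks)
{
	unsigned long long chunk = lsect / CHUNK;
	return (chunk / data_disks) * CHUNK + lsect % CHUNK;
}

int main(void)
{
	unsigned long long sector_nr = 0; /* destination per-device offset */
	int new_data = 4, old_data = 3;   /* 5-disk target, 4-disk source */

	unsigned long long first = dev_sector(sector_nr * new_data, old_data);
	unsigned long long last  = dev_sector((sector_nr + CHUNK) * new_data - 1,
					      old_data);
	printf("source device sectors %llu .. %llu\n", first, last); /* 0 .. 255 */
	return 0;
}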
@@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
 	}
 
-	x = sector_nr;
-	chunk_offset = sector_div(x, sectors_per_chunk);
-	stripe = x;
-	BUG_ON(x != stripe);
-
-	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
-		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
 	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
 	if (sh == NULL) {
 		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
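
[Editor's note] For completeness: this kernel-side machinery is driven from
user space. With an mdadm release new enough to support RAID5 reshape, the
sequence is roughly "mdadm /dev/md0 --add <new-device>" followed by
"mdadm --grow /dev/md0 --raid-devices=<n>", after which /proc/mdstat reports
the operation as "reshape" courtesy of the md.c hunks above. The exact mdadm
invocation is an assumption about the companion user-space tooling, not part
of this patch.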