diff options
author | NeilBrown <neilb@suse.de> | 2006-03-27 04:18:09 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-27 11:45:01 -0500 |
commit | ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e (patch) | |
tree | 0c86fe6ba0f0bafd76f86f81b7425d6d497664aa /drivers/md/raid5.c | |
parent | 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d (diff) |
[PATCH] md: Core of raid5 resize process
This patch provides the core of the resize/expand process.
sync_request notices if a 'reshape' is happening and acts accordingly.
It allocates new stripe_heads for the next chunk-wide-stripe in the target
geometry, marking them STRIPE_EXPANDING.
Then it finds which stripe heads in the old geometry can provide data needed
by these and marks them STRIPE_EXPAND_SOURCE. This causes handle_stripe to
read all blocks on those stripes.
Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are
needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a
STRIPE_EXPANDING stripe_head is full, it is marked STRIPE_EXPAND_READY and then
is written out and released.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 185 |
1 files changed, 163 insertions, 22 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7a6df515b008..56cba8d3e398 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
93 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 93 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
94 | md_wakeup_thread(conf->mddev->thread); | 94 | md_wakeup_thread(conf->mddev->thread); |
95 | } | 95 | } |
96 | list_add_tail(&sh->lru, &conf->inactive_list); | ||
97 | atomic_dec(&conf->active_stripes); | 96 | atomic_dec(&conf->active_stripes); |
98 | if (!conf->inactive_blocked || | 97 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
99 | atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) | 98 | list_add_tail(&sh->lru, &conf->inactive_list); |
100 | wake_up(&conf->wait_for_stripe); | 99 | wake_up(&conf->wait_for_stripe); |
100 | } | ||
101 | } | 101 | } |
102 | } | 102 | } |
103 | } | 103 | } |
@@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
273 | } else { | 273 | } else { |
274 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 274 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
275 | atomic_inc(&conf->active_stripes); | 275 | atomic_inc(&conf->active_stripes); |
276 | if (list_empty(&sh->lru)) | 276 | if (!list_empty(&sh->lru)) |
277 | BUG(); | 277 | list_del_init(&sh->lru); |
278 | list_del_init(&sh->lru); | ||
279 | } | 278 | } |
280 | } | 279 | } |
281 | } while (sh == NULL); | 280 | } while (sh == NULL); |
@@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1035 | return 0; | 1034 | return 0; |
1036 | } | 1035 | } |
1037 | 1036 | ||
1037 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | ||
1038 | { | ||
1039 | int sectors_per_chunk = conf->chunk_size >> 9; | ||
1040 | sector_t x = stripe; | ||
1041 | int pd_idx, dd_idx; | ||
1042 | int chunk_offset = sector_div(x, sectors_per_chunk); | ||
1043 | stripe = x; | ||
1044 | raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk | ||
1045 | + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); | ||
1046 | return pd_idx; | ||
1047 | } | ||
1048 | |||
1038 | 1049 | ||
1039 | /* | 1050 | /* |
1040 | * handle_stripe - do things to a stripe. | 1051 | * handle_stripe - do things to a stripe. |
@@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1061 | struct bio *return_bi= NULL; | 1072 | struct bio *return_bi= NULL; |
1062 | struct bio *bi; | 1073 | struct bio *bi; |
1063 | int i; | 1074 | int i; |
1064 | int syncing; | 1075 | int syncing, expanding, expanded; |
1065 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; | 1076 | int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; |
1066 | int non_overwrite = 0; | 1077 | int non_overwrite = 0; |
1067 | int failed_num=0; | 1078 | int failed_num=0; |
@@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1076 | clear_bit(STRIPE_DELAYED, &sh->state); | 1087 | clear_bit(STRIPE_DELAYED, &sh->state); |
1077 | 1088 | ||
1078 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | 1089 | syncing = test_bit(STRIPE_SYNCING, &sh->state); |
1090 | expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
1091 | expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | ||
1079 | /* Now to look around and see what can be done */ | 1092 | /* Now to look around and see what can be done */ |
1080 | 1093 | ||
1081 | rcu_read_lock(); | 1094 | rcu_read_lock(); |
@@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh) | |||
1268 | * parity, or to satisfy requests | 1281 | * parity, or to satisfy requests |
1269 | * or to load a block that is being partially written. | 1282 | * or to load a block that is being partially written. |
1270 | */ | 1283 | */ |
1271 | if (to_read || non_overwrite || (syncing && (uptodate < disks))) { | 1284 | if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { |
1272 | for (i=disks; i--;) { | 1285 | for (i=disks; i--;) { |
1273 | dev = &sh->dev[i]; | 1286 | dev = &sh->dev[i]; |
1274 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && | 1287 | if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && |
1275 | (dev->toread || | 1288 | (dev->toread || |
1276 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 1289 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
1277 | syncing || | 1290 | syncing || |
1291 | expanding || | ||
1278 | (failed && (sh->dev[failed_num].toread || | 1292 | (failed && (sh->dev[failed_num].toread || |
1279 | (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) | 1293 | (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) |
1280 | ) | 1294 | ) |
@@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh) | |||
1464 | set_bit(R5_Wantwrite, &dev->flags); | 1478 | set_bit(R5_Wantwrite, &dev->flags); |
1465 | set_bit(R5_ReWrite, &dev->flags); | 1479 | set_bit(R5_ReWrite, &dev->flags); |
1466 | set_bit(R5_LOCKED, &dev->flags); | 1480 | set_bit(R5_LOCKED, &dev->flags); |
1481 | locked++; | ||
1467 | } else { | 1482 | } else { |
1468 | /* let's read it back */ | 1483 | /* let's read it back */ |
1469 | set_bit(R5_Wantread, &dev->flags); | 1484 | set_bit(R5_Wantread, &dev->flags); |
1470 | set_bit(R5_LOCKED, &dev->flags); | 1485 | set_bit(R5_LOCKED, &dev->flags); |
1486 | locked++; | ||
1471 | } | 1487 | } |
1472 | } | 1488 | } |
1473 | 1489 | ||
1490 | if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | ||
1491 | /* Need to write out all blocks after computing parity */ | ||
1492 | sh->disks = conf->raid_disks; | ||
1493 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); | ||
1494 | compute_parity(sh, RECONSTRUCT_WRITE); | ||
1495 | for (i= conf->raid_disks; i--;) { | ||
1496 | set_bit(R5_LOCKED, &sh->dev[i].flags); | ||
1497 | locked++; | ||
1498 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
1499 | } | ||
1500 | clear_bit(STRIPE_EXPANDING, &sh->state); | ||
1501 | } else if (expanded) { | ||
1502 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | ||
1503 | wake_up(&conf->wait_for_overlap); | ||
1504 | md_done_sync(conf->mddev, STRIPE_SECTORS, 1); | ||
1505 | } | ||
1506 | |||
1507 | if (expanding && locked == 0) { | ||
1508 | /* We have read all the blocks in this stripe and now we need to | ||
1509 | * copy some of them into a target stripe for expand. | ||
1510 | */ | ||
1511 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
1512 | for (i=0; i< sh->disks; i++) | ||
1513 | if (i != sh->pd_idx) { | ||
1514 | int dd_idx, pd_idx, j; | ||
1515 | struct stripe_head *sh2; | ||
1516 | |||
1517 | sector_t bn = compute_blocknr(sh, i); | ||
1518 | sector_t s = raid5_compute_sector(bn, conf->raid_disks, | ||
1519 | conf->raid_disks-1, | ||
1520 | &dd_idx, &pd_idx, conf); | ||
1521 | sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); | ||
1522 | if (sh2 == NULL) | ||
1523 | /* so far only the early blocks of this stripe | ||
1524 | * have been requested. When later blocks | ||
1525 | * get requested, we will try again | ||
1526 | */ | ||
1527 | continue; | ||
1528 | if(!test_bit(STRIPE_EXPANDING, &sh2->state) || | ||
1529 | test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { | ||
1530 | /* must have already done this block */ | ||
1531 | release_stripe(sh2); | ||
1532 | continue; | ||
1533 | } | ||
1534 | memcpy(page_address(sh2->dev[dd_idx].page), | ||
1535 | page_address(sh->dev[i].page), | ||
1536 | STRIPE_SIZE); | ||
1537 | set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); | ||
1538 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | ||
1539 | for (j=0; j<conf->raid_disks; j++) | ||
1540 | if (j != sh2->pd_idx && | ||
1541 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | ||
1542 | break; | ||
1543 | if (j == conf->raid_disks) { | ||
1544 | set_bit(STRIPE_EXPAND_READY, &sh2->state); | ||
1545 | set_bit(STRIPE_HANDLE, &sh2->state); | ||
1546 | } | ||
1547 | release_stripe(sh2); | ||
1548 | } | ||
1549 | } | ||
1550 | |||
1474 | spin_unlock(&sh->lock); | 1551 | spin_unlock(&sh->lock); |
1475 | 1552 | ||
1476 | while ((bi=return_bi)) { | 1553 | while ((bi=return_bi)) { |
@@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1509 | rcu_read_unlock(); | 1586 | rcu_read_unlock(); |
1510 | 1587 | ||
1511 | if (rdev) { | 1588 | if (rdev) { |
1512 | if (syncing) | 1589 | if (syncing || expanding || expanded) |
1513 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 1590 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
1514 | 1591 | ||
1515 | bi->bi_bdev = rdev->bdev; | 1592 | bi->bi_bdev = rdev->bdev; |
@@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1757 | { | 1834 | { |
1758 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 1835 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
1759 | struct stripe_head *sh; | 1836 | struct stripe_head *sh; |
1760 | int sectors_per_chunk = conf->chunk_size >> 9; | 1837 | int pd_idx; |
1761 | sector_t x; | 1838 | sector_t first_sector, last_sector; |
1762 | unsigned long stripe; | ||
1763 | int chunk_offset; | ||
1764 | int dd_idx, pd_idx; | ||
1765 | sector_t first_sector; | ||
1766 | int raid_disks = conf->raid_disks; | 1839 | int raid_disks = conf->raid_disks; |
1767 | int data_disks = raid_disks-1; | 1840 | int data_disks = raid_disks-1; |
1768 | sector_t max_sector = mddev->size << 1; | 1841 | sector_t max_sector = mddev->size << 1; |
@@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1781 | 1854 | ||
1782 | return 0; | 1855 | return 0; |
1783 | } | 1856 | } |
1857 | |||
1858 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | ||
1859 | /* reshaping is quite different to recovery/resync so it is | ||
1860 | * handled quite separately ... here. | ||
1861 | * | ||
1862 | * On each call to sync_request, we gather one chunk worth of | ||
1863 | * destination stripes and flag them as expanding. | ||
1864 | * Then we find all the source stripes and request reads. | ||
1865 | * As the reads complete, handle_stripe will copy the data | ||
1866 | * into the destination stripe and release that stripe. | ||
1867 | */ | ||
1868 | int i; | ||
1869 | int dd_idx; | ||
1870 | for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { | ||
1871 | int j; | ||
1872 | int skipped = 0; | ||
1873 | pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); | ||
1874 | sh = get_active_stripe(conf, sector_nr+i, | ||
1875 | conf->raid_disks, pd_idx, 0); | ||
1876 | set_bit(STRIPE_EXPANDING, &sh->state); | ||
1877 | /* If any of this stripe is beyond the end of the old | ||
1878 | * array, then we need to zero those blocks | ||
1879 | */ | ||
1880 | for (j=sh->disks; j--;) { | ||
1881 | sector_t s; | ||
1882 | if (j == sh->pd_idx) | ||
1883 | continue; | ||
1884 | s = compute_blocknr(sh, j); | ||
1885 | if (s < (mddev->array_size<<1)) { | ||
1886 | skipped = 1; | ||
1887 | continue; | ||
1888 | } | ||
1889 | memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); | ||
1890 | set_bit(R5_Expanded, &sh->dev[j].flags); | ||
1891 | set_bit(R5_UPTODATE, &sh->dev[j].flags); | ||
1892 | } | ||
1893 | if (!skipped) { | ||
1894 | set_bit(STRIPE_EXPAND_READY, &sh->state); | ||
1895 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1896 | } | ||
1897 | release_stripe(sh); | ||
1898 | } | ||
1899 | spin_lock_irq(&conf->device_lock); | ||
1900 | conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); | ||
1901 | spin_unlock_irq(&conf->device_lock); | ||
1902 | /* Ok, those stripe are ready. We can start scheduling | ||
1903 | * reads on the source stripes. | ||
1904 | * The source stripes are determined by mapping the first and last | ||
1905 | * block on the destination stripes. | ||
1906 | */ | ||
1907 | raid_disks = conf->previous_raid_disks; | ||
1908 | data_disks = raid_disks - 1; | ||
1909 | first_sector = | ||
1910 | raid5_compute_sector(sector_nr*(conf->raid_disks-1), | ||
1911 | raid_disks, data_disks, | ||
1912 | &dd_idx, &pd_idx, conf); | ||
1913 | last_sector = | ||
1914 | raid5_compute_sector((sector_nr+conf->chunk_size/512) | ||
1915 | *(conf->raid_disks-1) -1, | ||
1916 | raid_disks, data_disks, | ||
1917 | &dd_idx, &pd_idx, conf); | ||
1918 | if (last_sector >= (mddev->size<<1)) | ||
1919 | last_sector = (mddev->size<<1)-1; | ||
1920 | while (first_sector <= last_sector) { | ||
1921 | pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); | ||
1922 | sh = get_active_stripe(conf, first_sector, | ||
1923 | conf->previous_raid_disks, pd_idx, 0); | ||
1924 | set_bit(STRIPE_EXPAND_SOURCE, &sh->state); | ||
1925 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1926 | release_stripe(sh); | ||
1927 | first_sector += STRIPE_SECTORS; | ||
1928 | } | ||
1929 | return conf->chunk_size>>9; | ||
1930 | } | ||
1784 | /* if there is 1 or more failed drives and we are trying | 1931 | /* if there is 1 or more failed drives and we are trying |
1785 | * to resync, then assert that we are finished, because there is | 1932 | * to resync, then assert that we are finished, because there is |
1786 | * nothing we can do. | 1933 | * nothing we can do. |
@@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1799 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | 1946 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ |
1800 | } | 1947 | } |
1801 | 1948 | ||
1802 | x = sector_nr; | 1949 | pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); |
1803 | chunk_offset = sector_div(x, sectors_per_chunk); | ||
1804 | stripe = x; | ||
1805 | BUG_ON(x != stripe); | ||
1806 | |||
1807 | first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk | ||
1808 | + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); | ||
1809 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); | 1950 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); |
1810 | if (sh == NULL) { | 1951 | if (sh == NULL) { |
1811 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); | 1952 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); |