diff options
author | NeilBrown <neilb@suse.de> | 2006-01-06 03:20:17 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-06 11:34:03 -0500 |
commit | ca65b73bd9c301d243df93780f7b26579e6c9204 (patch) | |
tree | 1ca8157e11033989ee94c20b1f2d4c936f9dc09c | |
parent | 6cce3b23f6f8e974c00af7a9b88f1d413ba368a8 (diff) |
[PATCH] md: fix raid6 resync check/repair code
raid6 currently does not check the P/Q syndromes when doing a resync; it just
calculates the correct values and writes them. Doing the check can reduce writes
(often to 0) for a resync, and it is needed to properly implement the
    echo check > sync_action
operation.
This patch implements the appropriate checks and tidies up some related code.
It also allows a user-requested raid6 resync to bypass the intent bitmap.
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/md/raid6main.c | 182 | ||||
-rw-r--r-- | include/linux/raid/raid5.h | 2 |
2 files changed, 108 insertions, 76 deletions
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 304455d236f9..52e8796bb8ac 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c | |||
@@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method) | |||
805 | } | 805 | } |
806 | 806 | ||
807 | /* Compute one missing block */ | 807 | /* Compute one missing block */ |
808 | static void compute_block_1(struct stripe_head *sh, int dd_idx) | 808 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) |
809 | { | 809 | { |
810 | raid6_conf_t *conf = sh->raid_conf; | 810 | raid6_conf_t *conf = sh->raid_conf; |
811 | int i, count, disks = conf->raid_disks; | 811 | int i, count, disks = conf->raid_disks; |
@@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) | |||
821 | compute_parity(sh, UPDATE_PARITY); | 821 | compute_parity(sh, UPDATE_PARITY); |
822 | } else { | 822 | } else { |
823 | ptr[0] = page_address(sh->dev[dd_idx].page); | 823 | ptr[0] = page_address(sh->dev[dd_idx].page); |
824 | memset(ptr[0], 0, STRIPE_SIZE); | 824 | if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); |
825 | count = 1; | 825 | count = 1; |
826 | for (i = disks ; i--; ) { | 826 | for (i = disks ; i--; ) { |
827 | if (i == dd_idx || i == qd_idx) | 827 | if (i == dd_idx || i == qd_idx) |
@@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) | |||
838 | } | 838 | } |
839 | if (count != 1) | 839 | if (count != 1) |
840 | xor_block(count, STRIPE_SIZE, ptr); | 840 | xor_block(count, STRIPE_SIZE, ptr); |
841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | 841 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); |
842 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
842 | } | 843 | } |
843 | } | 844 | } |
844 | 845 | ||
@@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | |||
871 | return; | 872 | return; |
872 | } else { | 873 | } else { |
873 | /* We're missing D+Q; recompute D from P */ | 874 | /* We're missing D+Q; recompute D from P */ |
874 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); | 875 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); |
875 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ | 876 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ |
876 | return; | 877 | return; |
877 | } | 878 | } |
@@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
982 | } | 983 | } |
983 | 984 | ||
984 | 985 | ||
986 | static int page_is_zero(struct page *p) | ||
987 | { | ||
988 | char *a = page_address(p); | ||
989 | return ((*(u32*)a) == 0 && | ||
990 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
991 | } | ||
985 | /* | 992 | /* |
986 | * handle_stripe - do things to a stripe. | 993 | * handle_stripe - do things to a stripe. |
987 | * | 994 | * |
@@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1000 | * | 1007 | * |
1001 | */ | 1008 | */ |
1002 | 1009 | ||
1003 | static void handle_stripe(struct stripe_head *sh) | 1010 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) |
1004 | { | 1011 | { |
1005 | raid6_conf_t *conf = sh->raid_conf; | 1012 | raid6_conf_t *conf = sh->raid_conf; |
1006 | int disks = conf->raid_disks; | 1013 | int disks = conf->raid_disks; |
@@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1228 | if (uptodate == disks-1) { | 1235 | if (uptodate == disks-1) { |
1229 | PRINTK("Computing stripe %llu block %d\n", | 1236 | PRINTK("Computing stripe %llu block %d\n", |
1230 | (unsigned long long)sh->sector, i); | 1237 | (unsigned long long)sh->sector, i); |
1231 | compute_block_1(sh, i); | 1238 | compute_block_1(sh, i, 0); |
1232 | uptodate++; | 1239 | uptodate++; |
1233 | } else if ( uptodate == disks-2 && failed >= 2 ) { | 1240 | } else if ( uptodate == disks-2 && failed >= 2 ) { |
1234 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ | 1241 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ |
@@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1323 | /* We have failed blocks and need to compute them */ | 1330 | /* We have failed blocks and need to compute them */ |
1324 | switch ( failed ) { | 1331 | switch ( failed ) { |
1325 | case 0: BUG(); | 1332 | case 0: BUG(); |
1326 | case 1: compute_block_1(sh, failed_num[0]); break; | 1333 | case 1: compute_block_1(sh, failed_num[0], 0); break; |
1327 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; | 1334 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; |
1328 | default: BUG(); /* This request should have been failed? */ | 1335 | default: BUG(); /* This request should have been failed? */ |
1329 | } | 1336 | } |
@@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh) | |||
1338 | (unsigned long long)sh->sector, i); | 1345 | (unsigned long long)sh->sector, i); |
1339 | locked++; | 1346 | locked++; |
1340 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 1347 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
1341 | #if 0 /**** FIX: I don't understand the logic here... ****/ | ||
1342 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1343 | || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ | ||
1344 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1345 | #endif | ||
1346 | } | 1348 | } |
1349 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
1350 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1351 | |||
1347 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 1352 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
1348 | atomic_dec(&conf->preread_active_stripes); | 1353 | atomic_dec(&conf->preread_active_stripes); |
1349 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 1354 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
@@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh) | |||
1356 | * Any reads will already have been scheduled, so we just see if enough data | 1361 | * Any reads will already have been scheduled, so we just see if enough data |
1357 | * is available | 1362 | * is available |
1358 | */ | 1363 | */ |
1359 | if (syncing && locked == 0 && | 1364 | if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { |
1360 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { | 1365 | int update_p = 0, update_q = 0; |
1361 | set_bit(STRIPE_HANDLE, &sh->state); | 1366 | struct r5dev *dev; |
1362 | #if 0 /* RAID-6: Don't support CHECK PARITY yet */ | ||
1363 | if (failed == 0) { | ||
1364 | char *pagea; | ||
1365 | if (uptodate != disks) | ||
1366 | BUG(); | ||
1367 | compute_parity(sh, CHECK_PARITY); | ||
1368 | uptodate--; | ||
1369 | pagea = page_address(sh->dev[pd_idx].page); | ||
1370 | if ((*(u32*)pagea) == 0 && | ||
1371 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1372 | /* parity is correct (on disc, not in buffer any more) */ | ||
1373 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1374 | } | ||
1375 | } | ||
1376 | #endif | ||
1377 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1378 | int failed_needupdate[2]; | ||
1379 | struct r5dev *adev, *bdev; | ||
1380 | |||
1381 | if ( failed < 1 ) | ||
1382 | failed_num[0] = pd_idx; | ||
1383 | if ( failed < 2 ) | ||
1384 | failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; | ||
1385 | 1367 | ||
1386 | failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); | 1368 | set_bit(STRIPE_HANDLE, &sh->state); |
1387 | failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); | ||
1388 | 1369 | ||
1389 | PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", | 1370 | BUG_ON(failed>2); |
1390 | failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); | 1371 | BUG_ON(uptodate < disks); |
1372 | /* Want to check and possibly repair P and Q. | ||
1373 | * However there could be one 'failed' device, in which | ||
1374 | * case we can only check one of them, possibly using the | ||
1375 | * other to generate missing data | ||
1376 | */ | ||
1391 | 1377 | ||
1392 | #if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ | 1378 | /* If !tmp_page, we cannot do the calculations, |
1393 | /* should be able to compute the missing block(s) and write to spare */ | 1379 | * but as we have set STRIPE_HANDLE, we will soon be called |
1394 | if ( failed_needupdate[0] ^ failed_needupdate[1] ) { | 1380 | * by stripe_handle with a tmp_page - just wait until then. |
1395 | if (uptodate+1 != disks) | 1381 | */ |
1396 | BUG(); | 1382 | if (tmp_page) { |
1397 | compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); | 1383 | if (failed == q_failed) { |
1398 | uptodate++; | 1384 | /* The only possible failed device holds 'Q', so it makes |
1399 | } else if ( failed_needupdate[0] & failed_needupdate[1] ) { | 1385 | * sense to check P (If anything else were failed, we would |
1400 | if (uptodate+2 != disks) | 1386 | * have used P to recreate it). |
1401 | BUG(); | 1387 | */ |
1402 | compute_block_2(sh, failed_num[0], failed_num[1]); | 1388 | compute_block_1(sh, pd_idx, 1); |
1403 | uptodate += 2; | 1389 | if (!page_is_zero(sh->dev[pd_idx].page)) { |
1390 | compute_block_1(sh,pd_idx,0); | ||
1391 | update_p = 1; | ||
1392 | } | ||
1393 | } | ||
1394 | if (!q_failed && failed < 2) { | ||
1395 | /* q is not failed, and we didn't use it to generate | ||
1396 | * anything, so it makes sense to check it | ||
1397 | */ | ||
1398 | memcpy(page_address(tmp_page), | ||
1399 | page_address(sh->dev[qd_idx].page), | ||
1400 | STRIPE_SIZE); | ||
1401 | compute_parity(sh, UPDATE_PARITY); | ||
1402 | if (memcmp(page_address(tmp_page), | ||
1403 | page_address(sh->dev[qd_idx].page), | ||
1404 | STRIPE_SIZE)!= 0) { | ||
1405 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1406 | update_q = 1; | ||
1407 | } | ||
1408 | } | ||
1409 | if (update_p || update_q) { | ||
1410 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
1411 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
1412 | /* don't try to repair!! */ | ||
1413 | update_p = update_q = 0; | ||
1404 | } | 1414 | } |
1405 | #else | ||
1406 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1407 | uptodate += failed_needupdate[0] + failed_needupdate[1]; | ||
1408 | #endif | ||
1409 | 1415 | ||
1410 | if (uptodate != disks) | 1416 | /* now write out any block on a failed drive, |
1411 | BUG(); | 1417 | * or P or Q if they need it |
1418 | */ | ||
1412 | 1419 | ||
1413 | PRINTK("Marking for sync stripe %llu blocks %d,%d\n", | 1420 | if (failed == 2) { |
1414 | (unsigned long long)sh->sector, failed_num[0], failed_num[1]); | 1421 | dev = &sh->dev[failed_num[1]]; |
1422 | locked++; | ||
1423 | set_bit(R5_LOCKED, &dev->flags); | ||
1424 | set_bit(R5_Wantwrite, &dev->flags); | ||
1425 | set_bit(R5_Syncio, &dev->flags); | ||
1426 | } | ||
1427 | if (failed >= 1) { | ||
1428 | dev = &sh->dev[failed_num[0]]; | ||
1429 | locked++; | ||
1430 | set_bit(R5_LOCKED, &dev->flags); | ||
1431 | set_bit(R5_Wantwrite, &dev->flags); | ||
1432 | set_bit(R5_Syncio, &dev->flags); | ||
1433 | } | ||
1415 | 1434 | ||
1416 | /**** FIX: Should we really do both of these unconditionally? ****/ | 1435 | if (update_p) { |
1417 | adev = &sh->dev[failed_num[0]]; | 1436 | dev = &sh->dev[pd_idx]; |
1418 | locked += !test_bit(R5_LOCKED, &adev->flags); | 1437 | locked ++; |
1419 | set_bit(R5_LOCKED, &adev->flags); | 1438 | set_bit(R5_LOCKED, &dev->flags); |
1420 | set_bit(R5_Wantwrite, &adev->flags); | 1439 | set_bit(R5_Wantwrite, &dev->flags); |
1421 | bdev = &sh->dev[failed_num[1]]; | 1440 | set_bit(R5_Syncio, &dev->flags); |
1422 | locked += !test_bit(R5_LOCKED, &bdev->flags); | 1441 | } |
1423 | set_bit(R5_LOCKED, &bdev->flags); | 1442 | if (update_q) { |
1443 | dev = &sh->dev[qd_idx]; | ||
1444 | locked++; | ||
1445 | set_bit(R5_LOCKED, &dev->flags); | ||
1446 | set_bit(R5_Wantwrite, &dev->flags); | ||
1447 | set_bit(R5_Syncio, &dev->flags); | ||
1448 | } | ||
1424 | clear_bit(STRIPE_DEGRADED, &sh->state); | 1449 | clear_bit(STRIPE_DEGRADED, &sh->state); |
1425 | set_bit(R5_Wantwrite, &bdev->flags); | ||
1426 | 1450 | ||
1427 | set_bit(STRIPE_INSYNC, &sh->state); | 1451 | set_bit(STRIPE_INSYNC, &sh->state); |
1428 | set_bit(R5_Syncio, &adev->flags); | ||
1429 | set_bit(R5_Syncio, &bdev->flags); | ||
1430 | } | 1452 | } |
1431 | } | 1453 | } |
1454 | |||
1432 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 1455 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
1433 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 1456 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
1434 | clear_bit(STRIPE_SYNCING, &sh->state); | 1457 | clear_bit(STRIPE_SYNCING, &sh->state); |
@@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
1664 | } | 1687 | } |
1665 | finish_wait(&conf->wait_for_overlap, &w); | 1688 | finish_wait(&conf->wait_for_overlap, &w); |
1666 | raid6_plug_device(conf); | 1689 | raid6_plug_device(conf); |
1667 | handle_stripe(sh); | 1690 | handle_stripe(sh, NULL); |
1668 | release_stripe(sh); | 1691 | release_stripe(sh); |
1669 | } else { | 1692 | } else { |
1670 | /* cannot get stripe for read-ahead, just give-up */ | 1693 | /* cannot get stripe for read-ahead, just give-up */ |
@@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1728 | return rv; | 1751 | return rv; |
1729 | } | 1752 | } |
1730 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | 1753 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && |
1754 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && | ||
1731 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | 1755 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { |
1732 | /* we can skip this block, and probably more */ | 1756 | /* we can skip this block, and probably more */ |
1733 | sync_blocks /= STRIPE_SECTORS; | 1757 | sync_blocks /= STRIPE_SECTORS; |
@@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1765 | clear_bit(STRIPE_INSYNC, &sh->state); | 1789 | clear_bit(STRIPE_INSYNC, &sh->state); |
1766 | spin_unlock(&sh->lock); | 1790 | spin_unlock(&sh->lock); |
1767 | 1791 | ||
1768 | handle_stripe(sh); | 1792 | handle_stripe(sh, NULL); |
1769 | release_stripe(sh); | 1793 | release_stripe(sh); |
1770 | 1794 | ||
1771 | return STRIPE_SECTORS; | 1795 | return STRIPE_SECTORS; |
@@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev) | |||
1821 | spin_unlock_irq(&conf->device_lock); | 1845 | spin_unlock_irq(&conf->device_lock); |
1822 | 1846 | ||
1823 | handled++; | 1847 | handled++; |
1824 | handle_stripe(sh); | 1848 | handle_stripe(sh, conf->spare_page); |
1825 | release_stripe(sh); | 1849 | release_stripe(sh); |
1826 | 1850 | ||
1827 | spin_lock_irq(&conf->device_lock); | 1851 | spin_lock_irq(&conf->device_lock); |
@@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev) | |||
1860 | goto abort; | 1884 | goto abort; |
1861 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | 1885 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); |
1862 | 1886 | ||
1887 | conf->spare_page = alloc_page(GFP_KERNEL); | ||
1888 | if (!conf->spare_page) | ||
1889 | goto abort; | ||
1890 | |||
1863 | spin_lock_init(&conf->device_lock); | 1891 | spin_lock_init(&conf->device_lock); |
1864 | init_waitqueue_head(&conf->wait_for_stripe); | 1892 | init_waitqueue_head(&conf->wait_for_stripe); |
1865 | init_waitqueue_head(&conf->wait_for_overlap); | 1893 | init_waitqueue_head(&conf->wait_for_overlap); |
@@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev) | |||
1996 | abort: | 2024 | abort: |
1997 | if (conf) { | 2025 | if (conf) { |
1998 | print_raid6_conf(conf); | 2026 | print_raid6_conf(conf); |
2027 | if (conf->spare_page) | ||
2028 | page_cache_release(conf->spare_page); | ||
1999 | if (conf->stripe_hashtbl) | 2029 | if (conf->stripe_hashtbl) |
2000 | free_pages((unsigned long) conf->stripe_hashtbl, | 2030 | free_pages((unsigned long) conf->stripe_hashtbl, |
2001 | HASH_PAGES_ORDER); | 2031 | HASH_PAGES_ORDER); |
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f025ba6fb14c..e9c1c0d4f90b 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -228,6 +228,8 @@ struct raid5_private_data { | |||
228 | * Cleared when a sync completes. | 228 | * Cleared when a sync completes. |
229 | */ | 229 | */ |
230 | 230 | ||
231 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
232 | |||
231 | /* | 233 | /* |
232 | * Free stripes pool | 234 | * Free stripes pool |
233 | */ | 235 | */ |