aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: NeilBrown <neilb@suse.de> 2006-01-06 03:20:17 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:34:03 -0500
commit: ca65b73bd9c301d243df93780f7b26579e6c9204 (patch)
tree: 1ca8157e11033989ee94c20b1f2d4c936f9dc09c
parent: 6cce3b23f6f8e974c00af7a9b88f1d413ba368a8 (diff)
[PATCH] md: fix raid6 resync check/repair code

raid6 currently does not check the P/Q syndromes when doing a resync, it just calculates the correct value and writes it. Doing the check can reduce writes (often to 0) for a resync, and it is needed to properly implement the

    echo check > sync_action

operation.

This patch implements the appropriate checks and tidies up some related code.

It also allows raid6 user-requested resync to bypass the intent bitmap.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

-rw-r--r--  drivers/md/raid6main.c      182
-rw-r--r--  include/linux/raid/raid5.h    2
2 files changed, 108 insertions, 76 deletions
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 304455d236f9..52e8796bb8ac 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method)
805} 805}
806 806
807/* Compute one missing block */ 807/* Compute one missing block */
808static void compute_block_1(struct stripe_head *sh, int dd_idx) 808static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
809{ 809{
810 raid6_conf_t *conf = sh->raid_conf; 810 raid6_conf_t *conf = sh->raid_conf;
811 int i, count, disks = conf->raid_disks; 811 int i, count, disks = conf->raid_disks;
@@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
821 compute_parity(sh, UPDATE_PARITY); 821 compute_parity(sh, UPDATE_PARITY);
822 } else { 822 } else {
823 ptr[0] = page_address(sh->dev[dd_idx].page); 823 ptr[0] = page_address(sh->dev[dd_idx].page);
824 memset(ptr[0], 0, STRIPE_SIZE); 824 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
825 count = 1; 825 count = 1;
826 for (i = disks ; i--; ) { 826 for (i = disks ; i--; ) {
827 if (i == dd_idx || i == qd_idx) 827 if (i == dd_idx || i == qd_idx)
@@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
838 } 838 }
839 if (count != 1) 839 if (count != 1)
840 xor_block(count, STRIPE_SIZE, ptr); 840 xor_block(count, STRIPE_SIZE, ptr);
841 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 841 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
842 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
842 } 843 }
843} 844}
844 845
@@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871 return; 872 return;
872 } else { 873 } else {
873 /* We're missing D+Q; recompute D from P */ 874 /* We're missing D+Q; recompute D from P */
874 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); 875 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
875 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ 876 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
876 return; 877 return;
877 } 878 }
@@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
982} 983}
983 984
984 985
986static int page_is_zero(struct page *p)
987{
988 char *a = page_address(p);
989 return ((*(u32*)a) == 0 &&
990 memcmp(a, a+4, STRIPE_SIZE-4)==0);
991}
985/* 992/*
986 * handle_stripe - do things to a stripe. 993 * handle_stripe - do things to a stripe.
987 * 994 *
@@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1000 * 1007 *
1001 */ 1008 */
1002 1009
1003static void handle_stripe(struct stripe_head *sh) 1010static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1004{ 1011{
1005 raid6_conf_t *conf = sh->raid_conf; 1012 raid6_conf_t *conf = sh->raid_conf;
1006 int disks = conf->raid_disks; 1013 int disks = conf->raid_disks;
@@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh)
1228 if (uptodate == disks-1) { 1235 if (uptodate == disks-1) {
1229 PRINTK("Computing stripe %llu block %d\n", 1236 PRINTK("Computing stripe %llu block %d\n",
1230 (unsigned long long)sh->sector, i); 1237 (unsigned long long)sh->sector, i);
1231 compute_block_1(sh, i); 1238 compute_block_1(sh, i, 0);
1232 uptodate++; 1239 uptodate++;
1233 } else if ( uptodate == disks-2 && failed >= 2 ) { 1240 } else if ( uptodate == disks-2 && failed >= 2 ) {
1234 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ 1241 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh)
1323 /* We have failed blocks and need to compute them */ 1330 /* We have failed blocks and need to compute them */
1324 switch ( failed ) { 1331 switch ( failed ) {
1325 case 0: BUG(); 1332 case 0: BUG();
1326 case 1: compute_block_1(sh, failed_num[0]); break; 1333 case 1: compute_block_1(sh, failed_num[0], 0); break;
1327 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; 1334 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1328 default: BUG(); /* This request should have been failed? */ 1335 default: BUG(); /* This request should have been failed? */
1329 } 1336 }
@@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh)
1338 (unsigned long long)sh->sector, i); 1345 (unsigned long long)sh->sector, i);
1339 locked++; 1346 locked++;
1340 set_bit(R5_Wantwrite, &sh->dev[i].flags); 1347 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1341#if 0 /**** FIX: I don't understand the logic here... ****/
1342 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1343 || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
1344 set_bit(STRIPE_INSYNC, &sh->state);
1345#endif
1346 } 1348 }
1349 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1350 set_bit(STRIPE_INSYNC, &sh->state);
1351
1347 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 1352 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1348 atomic_dec(&conf->preread_active_stripes); 1353 atomic_dec(&conf->preread_active_stripes);
1349 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 1354 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh)
1356 * Any reads will already have been scheduled, so we just see if enough data 1361 * Any reads will already have been scheduled, so we just see if enough data
1357 * is available 1362 * is available
1358 */ 1363 */
1359 if (syncing && locked == 0 && 1364 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1360 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { 1365 int update_p = 0, update_q = 0;
1361 set_bit(STRIPE_HANDLE, &sh->state); 1366 struct r5dev *dev;
1362#if 0 /* RAID-6: Don't support CHECK PARITY yet */
1363 if (failed == 0) {
1364 char *pagea;
1365 if (uptodate != disks)
1366 BUG();
1367 compute_parity(sh, CHECK_PARITY);
1368 uptodate--;
1369 pagea = page_address(sh->dev[pd_idx].page);
1370 if ((*(u32*)pagea) == 0 &&
1371 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1372 /* parity is correct (on disc, not in buffer any more) */
1373 set_bit(STRIPE_INSYNC, &sh->state);
1374 }
1375 }
1376#endif
1377 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1378 int failed_needupdate[2];
1379 struct r5dev *adev, *bdev;
1380
1381 if ( failed < 1 )
1382 failed_num[0] = pd_idx;
1383 if ( failed < 2 )
1384 failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
1385 1367
1386 failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); 1368 set_bit(STRIPE_HANDLE, &sh->state);
1387 failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
1388 1369
1389 PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", 1370 BUG_ON(failed>2);
1390 failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); 1371 BUG_ON(uptodate < disks);
1372 /* Want to check and possibly repair P and Q.
1373 * However there could be one 'failed' device, in which
1374 * case we can only check one of them, possibly using the
1375 * other to generate missing data
1376 */
1391 1377
1392#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ 1378 /* If !tmp_page, we cannot do the calculations,
1393 /* should be able to compute the missing block(s) and write to spare */ 1379 * but as we have set STRIPE_HANDLE, we will soon be called
1394 if ( failed_needupdate[0] ^ failed_needupdate[1] ) { 1380 * by stripe_handle with a tmp_page - just wait until then.
1395 if (uptodate+1 != disks) 1381 */
1396 BUG(); 1382 if (tmp_page) {
1397 compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); 1383 if (failed == q_failed) {
1398 uptodate++; 1384 /* The only possible failed device holds 'Q', so it makes
1399 } else if ( failed_needupdate[0] & failed_needupdate[1] ) { 1385 * sense to check P (If anything else were failed, we would
1400 if (uptodate+2 != disks) 1386 * have used P to recreate it).
1401 BUG(); 1387 */
1402 compute_block_2(sh, failed_num[0], failed_num[1]); 1388 compute_block_1(sh, pd_idx, 1);
1403 uptodate += 2; 1389 if (!page_is_zero(sh->dev[pd_idx].page)) {
1390 compute_block_1(sh,pd_idx,0);
1391 update_p = 1;
1392 }
1393 }
1394 if (!q_failed && failed < 2) {
1395 /* q is not failed, and we didn't use it to generate
1396 * anything, so it makes sense to check it
1397 */
1398 memcpy(page_address(tmp_page),
1399 page_address(sh->dev[qd_idx].page),
1400 STRIPE_SIZE);
1401 compute_parity(sh, UPDATE_PARITY);
1402 if (memcmp(page_address(tmp_page),
1403 page_address(sh->dev[qd_idx].page),
1404 STRIPE_SIZE)!= 0) {
1405 clear_bit(STRIPE_INSYNC, &sh->state);
1406 update_q = 1;
1407 }
1408 }
1409 if (update_p || update_q) {
1410 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1411 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1412 /* don't try to repair!! */
1413 update_p = update_q = 0;
1404 } 1414 }
1405#else
1406 compute_block_2(sh, failed_num[0], failed_num[1]);
1407 uptodate += failed_needupdate[0] + failed_needupdate[1];
1408#endif
1409 1415
1410 if (uptodate != disks) 1416 /* now write out any block on a failed drive,
1411 BUG(); 1417 * or P or Q if they need it
1418 */
1412 1419
1413 PRINTK("Marking for sync stripe %llu blocks %d,%d\n", 1420 if (failed == 2) {
1414 (unsigned long long)sh->sector, failed_num[0], failed_num[1]); 1421 dev = &sh->dev[failed_num[1]];
1422 locked++;
1423 set_bit(R5_LOCKED, &dev->flags);
1424 set_bit(R5_Wantwrite, &dev->flags);
1425 set_bit(R5_Syncio, &dev->flags);
1426 }
1427 if (failed >= 1) {
1428 dev = &sh->dev[failed_num[0]];
1429 locked++;
1430 set_bit(R5_LOCKED, &dev->flags);
1431 set_bit(R5_Wantwrite, &dev->flags);
1432 set_bit(R5_Syncio, &dev->flags);
1433 }
1415 1434
1416 /**** FIX: Should we really do both of these unconditionally? ****/ 1435 if (update_p) {
1417 adev = &sh->dev[failed_num[0]]; 1436 dev = &sh->dev[pd_idx];
1418 locked += !test_bit(R5_LOCKED, &adev->flags); 1437 locked ++;
1419 set_bit(R5_LOCKED, &adev->flags); 1438 set_bit(R5_LOCKED, &dev->flags);
1420 set_bit(R5_Wantwrite, &adev->flags); 1439 set_bit(R5_Wantwrite, &dev->flags);
1421 bdev = &sh->dev[failed_num[1]]; 1440 set_bit(R5_Syncio, &dev->flags);
1422 locked += !test_bit(R5_LOCKED, &bdev->flags); 1441 }
1423 set_bit(R5_LOCKED, &bdev->flags); 1442 if (update_q) {
1443 dev = &sh->dev[qd_idx];
1444 locked++;
1445 set_bit(R5_LOCKED, &dev->flags);
1446 set_bit(R5_Wantwrite, &dev->flags);
1447 set_bit(R5_Syncio, &dev->flags);
1448 }
1424 clear_bit(STRIPE_DEGRADED, &sh->state); 1449 clear_bit(STRIPE_DEGRADED, &sh->state);
1425 set_bit(R5_Wantwrite, &bdev->flags);
1426 1450
1427 set_bit(STRIPE_INSYNC, &sh->state); 1451 set_bit(STRIPE_INSYNC, &sh->state);
1428 set_bit(R5_Syncio, &adev->flags);
1429 set_bit(R5_Syncio, &bdev->flags);
1430 } 1452 }
1431 } 1453 }
1454
1432 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 1455 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1433 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 1456 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1434 clear_bit(STRIPE_SYNCING, &sh->state); 1457 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1664 } 1687 }
1665 finish_wait(&conf->wait_for_overlap, &w); 1688 finish_wait(&conf->wait_for_overlap, &w);
1666 raid6_plug_device(conf); 1689 raid6_plug_device(conf);
1667 handle_stripe(sh); 1690 handle_stripe(sh, NULL);
1668 release_stripe(sh); 1691 release_stripe(sh);
1669 } else { 1692 } else {
1670 /* cannot get stripe for read-ahead, just give-up */ 1693 /* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1728 return rv; 1751 return rv;
1729 } 1752 }
1730 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1753 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1754 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1731 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 1755 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1732 /* we can skip this block, and probably more */ 1756 /* we can skip this block, and probably more */
1733 sync_blocks /= STRIPE_SECTORS; 1757 sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1765 clear_bit(STRIPE_INSYNC, &sh->state); 1789 clear_bit(STRIPE_INSYNC, &sh->state);
1766 spin_unlock(&sh->lock); 1790 spin_unlock(&sh->lock);
1767 1791
1768 handle_stripe(sh); 1792 handle_stripe(sh, NULL);
1769 release_stripe(sh); 1793 release_stripe(sh);
1770 1794
1771 return STRIPE_SECTORS; 1795 return STRIPE_SECTORS;
@@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev)
1821 spin_unlock_irq(&conf->device_lock); 1845 spin_unlock_irq(&conf->device_lock);
1822 1846
1823 handled++; 1847 handled++;
1824 handle_stripe(sh); 1848 handle_stripe(sh, conf->spare_page);
1825 release_stripe(sh); 1849 release_stripe(sh);
1826 1850
1827 spin_lock_irq(&conf->device_lock); 1851 spin_lock_irq(&conf->device_lock);
@@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev)
1860 goto abort; 1884 goto abort;
1861 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); 1885 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1862 1886
1887 conf->spare_page = alloc_page(GFP_KERNEL);
1888 if (!conf->spare_page)
1889 goto abort;
1890
1863 spin_lock_init(&conf->device_lock); 1891 spin_lock_init(&conf->device_lock);
1864 init_waitqueue_head(&conf->wait_for_stripe); 1892 init_waitqueue_head(&conf->wait_for_stripe);
1865 init_waitqueue_head(&conf->wait_for_overlap); 1893 init_waitqueue_head(&conf->wait_for_overlap);
@@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev)
1996abort: 2024abort:
1997 if (conf) { 2025 if (conf) {
1998 print_raid6_conf(conf); 2026 print_raid6_conf(conf);
2027 if (conf->spare_page)
2028 page_cache_release(conf->spare_page);
1999 if (conf->stripe_hashtbl) 2029 if (conf->stripe_hashtbl)
2000 free_pages((unsigned long) conf->stripe_hashtbl, 2030 free_pages((unsigned long) conf->stripe_hashtbl,
2001 HASH_PAGES_ORDER); 2031 HASH_PAGES_ORDER);
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f025ba6fb14c..e9c1c0d4f90b 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -228,6 +228,8 @@ struct raid5_private_data {
228 * Cleared when a sync completes. 228 * Cleared when a sync completes.
229 */ 229 */
230 230
231 struct page *spare_page; /* Used when checking P/Q in raid6 */
232
231 /* 233 /*
232 * Free stripes pool 234 * Free stripes pool
233 */ 235 */