diff options
author | Stefan Behrens <sbehrens@giantdisaster.de> | 2012-05-25 10:06:08 -0400 |
---|---|---|
committer | Josef Bacik <josef@redhat.com> | 2012-05-30 10:23:39 -0400 |
commit | 442a4f6308e694e0fa6025708bd5e4e424bbf51c (patch) | |
tree | e782db1bcbec25283048d77871e0bed7ad04567c /fs/btrfs/scrub.c | |
parent | d07eb9117050c9ed3f78296ebcc06128b52693be (diff) |
Btrfs: add device counters for detected IO and checksum errors
The goal is to detect when drives start to show an increased error rate,
i.e. when they should be replaced soon. To that end, statistics counters
are added that count IO errors (read, write and flush). Additionally,
software-detected errors such as checksum errors and corrupted blocks
are counted.
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 65 |
1 file changed, 49 insertions, 16 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f3d6f917fb3..a38cfa4f251e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -50,7 +50,7 @@ struct scrub_dev; | |||
50 | struct scrub_page { | 50 | struct scrub_page { |
51 | struct scrub_block *sblock; | 51 | struct scrub_block *sblock; |
52 | struct page *page; | 52 | struct page *page; |
53 | struct block_device *bdev; | 53 | struct btrfs_device *dev; |
54 | u64 flags; /* extent flags */ | 54 | u64 flags; /* extent flags */ |
55 | u64 generation; | 55 | u64 generation; |
56 | u64 logical; | 56 | u64 logical; |
@@ -86,6 +86,7 @@ struct scrub_block { | |||
86 | unsigned int header_error:1; | 86 | unsigned int header_error:1; |
87 | unsigned int checksum_error:1; | 87 | unsigned int checksum_error:1; |
88 | unsigned int no_io_error_seen:1; | 88 | unsigned int no_io_error_seen:1; |
89 | unsigned int generation_error:1; /* also sets header_error */ | ||
89 | }; | 90 | }; |
90 | }; | 91 | }; |
91 | 92 | ||
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
675 | sdev->stat.read_errors++; | 676 | sdev->stat.read_errors++; |
676 | sdev->stat.uncorrectable_errors++; | 677 | sdev->stat.uncorrectable_errors++; |
677 | spin_unlock(&sdev->stat_lock); | 678 | spin_unlock(&sdev->stat_lock); |
679 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
680 | BTRFS_DEV_STAT_READ_ERRS); | ||
678 | goto out; | 681 | goto out; |
679 | } | 682 | } |
680 | 683 | ||
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
686 | sdev->stat.read_errors++; | 689 | sdev->stat.read_errors++; |
687 | sdev->stat.uncorrectable_errors++; | 690 | sdev->stat.uncorrectable_errors++; |
688 | spin_unlock(&sdev->stat_lock); | 691 | spin_unlock(&sdev->stat_lock); |
692 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
693 | BTRFS_DEV_STAT_READ_ERRS); | ||
689 | goto out; | 694 | goto out; |
690 | } | 695 | } |
691 | BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); | 696 | BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); |
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
699 | sdev->stat.read_errors++; | 704 | sdev->stat.read_errors++; |
700 | sdev->stat.uncorrectable_errors++; | 705 | sdev->stat.uncorrectable_errors++; |
701 | spin_unlock(&sdev->stat_lock); | 706 | spin_unlock(&sdev->stat_lock); |
707 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
708 | BTRFS_DEV_STAT_READ_ERRS); | ||
702 | goto out; | 709 | goto out; |
703 | } | 710 | } |
704 | 711 | ||
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
725 | spin_unlock(&sdev->stat_lock); | 732 | spin_unlock(&sdev->stat_lock); |
726 | if (__ratelimit(&_rs)) | 733 | if (__ratelimit(&_rs)) |
727 | scrub_print_warning("i/o error", sblock_to_check); | 734 | scrub_print_warning("i/o error", sblock_to_check); |
735 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
736 | BTRFS_DEV_STAT_READ_ERRS); | ||
728 | } else if (sblock_bad->checksum_error) { | 737 | } else if (sblock_bad->checksum_error) { |
729 | spin_lock(&sdev->stat_lock); | 738 | spin_lock(&sdev->stat_lock); |
730 | sdev->stat.csum_errors++; | 739 | sdev->stat.csum_errors++; |
731 | spin_unlock(&sdev->stat_lock); | 740 | spin_unlock(&sdev->stat_lock); |
732 | if (__ratelimit(&_rs)) | 741 | if (__ratelimit(&_rs)) |
733 | scrub_print_warning("checksum error", sblock_to_check); | 742 | scrub_print_warning("checksum error", sblock_to_check); |
743 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
744 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | ||
734 | } else if (sblock_bad->header_error) { | 745 | } else if (sblock_bad->header_error) { |
735 | spin_lock(&sdev->stat_lock); | 746 | spin_lock(&sdev->stat_lock); |
736 | sdev->stat.verify_errors++; | 747 | sdev->stat.verify_errors++; |
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
738 | if (__ratelimit(&_rs)) | 749 | if (__ratelimit(&_rs)) |
739 | scrub_print_warning("checksum/header error", | 750 | scrub_print_warning("checksum/header error", |
740 | sblock_to_check); | 751 | sblock_to_check); |
752 | if (sblock_bad->generation_error) | ||
753 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
754 | BTRFS_DEV_STAT_GENERATION_ERRS); | ||
755 | else | ||
756 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
757 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | ||
741 | } | 758 | } |
742 | 759 | ||
743 | if (sdev->readonly) | 760 | if (sdev->readonly) |
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, | |||
998 | page = sblock->pagev + page_index; | 1015 | page = sblock->pagev + page_index; |
999 | page->logical = logical; | 1016 | page->logical = logical; |
1000 | page->physical = bbio->stripes[mirror_index].physical; | 1017 | page->physical = bbio->stripes[mirror_index].physical; |
1001 | /* for missing devices, bdev is NULL */ | 1018 | /* for missing devices, dev->bdev is NULL */ |
1002 | page->bdev = bbio->stripes[mirror_index].dev->bdev; | 1019 | page->dev = bbio->stripes[mirror_index].dev; |
1003 | page->mirror_num = mirror_index + 1; | 1020 | page->mirror_num = mirror_index + 1; |
1004 | page->page = alloc_page(GFP_NOFS); | 1021 | page->page = alloc_page(GFP_NOFS); |
1005 | if (!page->page) { | 1022 | if (!page->page) { |
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1043 | struct scrub_page *page = sblock->pagev + page_num; | 1060 | struct scrub_page *page = sblock->pagev + page_num; |
1044 | DECLARE_COMPLETION_ONSTACK(complete); | 1061 | DECLARE_COMPLETION_ONSTACK(complete); |
1045 | 1062 | ||
1046 | if (page->bdev == NULL) { | 1063 | if (page->dev->bdev == NULL) { |
1047 | page->io_error = 1; | 1064 | page->io_error = 1; |
1048 | sblock->no_io_error_seen = 0; | 1065 | sblock->no_io_error_seen = 0; |
1049 | continue; | 1066 | continue; |
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1053 | bio = bio_alloc(GFP_NOFS, 1); | 1070 | bio = bio_alloc(GFP_NOFS, 1); |
1054 | if (!bio) | 1071 | if (!bio) |
1055 | return -EIO; | 1072 | return -EIO; |
1056 | bio->bi_bdev = page->bdev; | 1073 | bio->bi_bdev = page->dev->bdev; |
1057 | bio->bi_sector = page->physical >> 9; | 1074 | bio->bi_sector = page->physical >> 9; |
1058 | bio->bi_end_io = scrub_complete_bio_end_io; | 1075 | bio->bi_end_io = scrub_complete_bio_end_io; |
1059 | bio->bi_private = &complete; | 1076 | bio->bi_private = &complete; |
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | |||
1102 | h = (struct btrfs_header *)mapped_buffer; | 1119 | h = (struct btrfs_header *)mapped_buffer; |
1103 | 1120 | ||
1104 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || | 1121 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || |
1105 | generation != le64_to_cpu(h->generation) || | ||
1106 | memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || | 1122 | memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || |
1107 | memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, | 1123 | memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, |
1108 | BTRFS_UUID_SIZE)) | 1124 | BTRFS_UUID_SIZE)) { |
1109 | sblock->header_error = 1; | 1125 | sblock->header_error = 1; |
1126 | } else if (generation != le64_to_cpu(h->generation)) { | ||
1127 | sblock->header_error = 1; | ||
1128 | sblock->generation_error = 1; | ||
1129 | } | ||
1110 | csum = h->csum; | 1130 | csum = h->csum; |
1111 | } else { | 1131 | } else { |
1112 | if (!have_csum) | 1132 | if (!have_csum) |
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | |||
1182 | bio = bio_alloc(GFP_NOFS, 1); | 1202 | bio = bio_alloc(GFP_NOFS, 1); |
1183 | if (!bio) | 1203 | if (!bio) |
1184 | return -EIO; | 1204 | return -EIO; |
1185 | bio->bi_bdev = page_bad->bdev; | 1205 | bio->bi_bdev = page_bad->dev->bdev; |
1186 | bio->bi_sector = page_bad->physical >> 9; | 1206 | bio->bi_sector = page_bad->physical >> 9; |
1187 | bio->bi_end_io = scrub_complete_bio_end_io; | 1207 | bio->bi_end_io = scrub_complete_bio_end_io; |
1188 | bio->bi_private = &complete; | 1208 | bio->bi_private = &complete; |
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | |||
1196 | 1216 | ||
1197 | /* this will also unplug the queue */ | 1217 | /* this will also unplug the queue */ |
1198 | wait_for_completion(&complete); | 1218 | wait_for_completion(&complete); |
1219 | if (!bio_flagged(bio, BIO_UPTODATE)) { | ||
1220 | btrfs_dev_stat_inc_and_print(page_bad->dev, | ||
1221 | BTRFS_DEV_STAT_WRITE_ERRS); | ||
1222 | bio_put(bio); | ||
1223 | return -EIO; | ||
1224 | } | ||
1199 | bio_put(bio); | 1225 | bio_put(bio); |
1200 | } | 1226 | } |
1201 | 1227 | ||
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1352 | u64 mapped_size; | 1378 | u64 mapped_size; |
1353 | void *p; | 1379 | void *p; |
1354 | u32 crc = ~(u32)0; | 1380 | u32 crc = ~(u32)0; |
1355 | int fail = 0; | 1381 | int fail_gen = 0; |
1382 | int fail_cor = 0; | ||
1356 | u64 len; | 1383 | u64 len; |
1357 | int index; | 1384 | int index; |
1358 | 1385 | ||
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1363 | memcpy(on_disk_csum, s->csum, sdev->csum_size); | 1390 | memcpy(on_disk_csum, s->csum, sdev->csum_size); |
1364 | 1391 | ||
1365 | if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) | 1392 | if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) |
1366 | ++fail; | 1393 | ++fail_cor; |
1367 | 1394 | ||
1368 | if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) | 1395 | if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) |
1369 | ++fail; | 1396 | ++fail_gen; |
1370 | 1397 | ||
1371 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) | 1398 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) |
1372 | ++fail; | 1399 | ++fail_cor; |
1373 | 1400 | ||
1374 | len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; | 1401 | len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; |
1375 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; | 1402 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1394 | 1421 | ||
1395 | btrfs_csum_final(crc, calculated_csum); | 1422 | btrfs_csum_final(crc, calculated_csum); |
1396 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) | 1423 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) |
1397 | ++fail; | 1424 | ++fail_cor; |
1398 | 1425 | ||
1399 | if (fail) { | 1426 | if (fail_cor + fail_gen) { |
1400 | /* | 1427 | /* |
1401 | * if we find an error in a super block, we just report it. | 1428 | * if we find an error in a super block, we just report it. |
1402 | * They will get written with the next transaction commit | 1429 | * They will get written with the next transaction commit |
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1405 | spin_lock(&sdev->stat_lock); | 1432 | spin_lock(&sdev->stat_lock); |
1406 | ++sdev->stat.super_errors; | 1433 | ++sdev->stat.super_errors; |
1407 | spin_unlock(&sdev->stat_lock); | 1434 | spin_unlock(&sdev->stat_lock); |
1435 | if (fail_cor) | ||
1436 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
1437 | BTRFS_DEV_STAT_CORRUPTION_ERRS); | ||
1438 | else | ||
1439 | btrfs_dev_stat_inc_and_print(sdev->dev, | ||
1440 | BTRFS_DEV_STAT_GENERATION_ERRS); | ||
1408 | } | 1441 | } |
1409 | 1442 | ||
1410 | return fail; | 1443 | return fail_cor + fail_gen; |
1411 | } | 1444 | } |
1412 | 1445 | ||
1413 | static void scrub_block_get(struct scrub_block *sblock) | 1446 | static void scrub_block_get(struct scrub_block *sblock) |
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1551 | return -ENOMEM; | 1584 | return -ENOMEM; |
1552 | } | 1585 | } |
1553 | spage->sblock = sblock; | 1586 | spage->sblock = sblock; |
1554 | spage->bdev = sdev->dev->bdev; | 1587 | spage->dev = sdev->dev; |
1555 | spage->flags = flags; | 1588 | spage->flags = flags; |
1556 | spage->generation = gen; | 1589 | spage->generation = gen; |
1557 | spage->logical = logical; | 1590 | spage->logical = logical; |