author     Stefan Behrens <sbehrens@giantdisaster.de>   2012-03-27 14:21:27 -0400
committer  Chris Mason <chris.mason@oracle.com>         2012-03-27 14:21:27 -0400
commit     b5d67f64f9bc656970dacba245410f0faedad18e (patch)
tree       3c267dbc01ae04dc827a563dc91baafdae14582a /fs/btrfs/scrub.c
parent     1623edebee317855c6a854366c01d1630cc537c9 (diff)
Btrfs: change scrub to support big blocks
Scrub used to be coded for nodesize == leafsize == sectorsize == PAGE_SIZE.
It is now changed to support nodesize and leafsize values that are
N * PAGE_SIZE.
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 1353
1 file changed, 1013 insertions, 340 deletions
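Before the diff itself, here is a minimal userspace sketch of the central idea: a node or leaf that is N * PAGE_SIZE large is now carried as an array of per-page buffers (the new struct scrub_block holding up to SCRUB_MAX_PAGES_PER_BLOCK scrub_pages), and checksums are accumulated page by page instead of assuming the whole block fits in one page. The demo_* names, the toy rolling checksum, and the userspace framing are illustrative only; the real code uses kmap_atomic() with btrfs_csum_data()/btrfs_csum_final(), as shown in the new scrub_checksum_tree_block() below.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096u
#define DEMO_CSUM_SIZE 32u        /* stands in for BTRFS_CSUM_SIZE */
#define DEMO_MAX_PAGES 16u        /* stands in for SCRUB_MAX_PAGES_PER_BLOCK */

/* one page worth of block data */
struct demo_page {
	uint8_t data[DEMO_PAGE_SIZE];
};

/* a "scrub block": N pages that together hold one node/leaf/sector */
struct demo_block {
	struct demo_page pages[DEMO_MAX_PAGES];
	int page_count;
};

/* stand-in for btrfs_csum_data(): any rolling checksum works for the demo */
static uint32_t demo_csum_data(const uint8_t *buf, size_t len, uint32_t crc)
{
	size_t i;

	for (i = 0; i < len; i++)
		crc = (crc << 5) + crc + buf[i];
	return crc;
}

/*
 * Checksum a block that may span several pages: the first page skips the
 * on-disk checksum bytes at the start of the header, the remaining pages
 * are hashed in full. This mirrors the loop shape the patch introduces in
 * scrub_checksum_tree_block(), minus the kmap_atomic() handling.
 */
static uint32_t demo_csum_block(const struct demo_block *blk, size_t nodesize)
{
	uint32_t crc = ~(uint32_t)0;
	size_t len = nodesize - DEMO_CSUM_SIZE;
	size_t off = DEMO_CSUM_SIZE;
	int index = 0;

	while (len > 0) {
		size_t l = DEMO_PAGE_SIZE - off;

		if (l > len)
			l = len;
		crc = demo_csum_data(blk->pages[index].data + off, l, crc);
		len -= l;
		off = 0;	/* only the first page skips the csum bytes */
		index++;
	}
	return crc;
}

int main(void)
{
	static struct demo_block blk = { .page_count = 4 };

	memset(blk.pages, 0xab, sizeof(blk.pages));
	/* a 16k tree node spread over four 4k pages */
	printf("csum: 0x%08x\n",
	       demo_csum_block(&blk, (size_t)blk.page_count * DEMO_PAGE_SIZE));
	return 0;
}
```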
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e68bab4ffcd4..5221e072bb65 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -40,16 +40,26 @@
40 | * - add a mode to also read unallocated space | 40 | * - add a mode to also read unallocated space |
41 | */ | 41 | */ |
42 | 42 | ||
43 | struct scrub_block; | ||
43 | struct scrub_dev; | 44 | struct scrub_dev; |
44 | 45 | ||
45 | #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ | 46 | #define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ |
46 | #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ | 47 | #define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ |
48 | #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ | ||
47 | 49 | ||
48 | struct scrub_page { | 50 | struct scrub_page { |
51 | struct scrub_block *sblock; | ||
52 | struct page *page; | ||
53 | struct block_device *bdev; | ||
49 | u64 flags; /* extent flags */ | 54 | u64 flags; /* extent flags */ |
50 | u64 generation; | 55 | u64 generation; |
51 | int mirror_num; | 56 | u64 logical; |
52 | int have_csum; | 57 | u64 physical; |
58 | struct { | ||
59 | unsigned int mirror_num:8; | ||
60 | unsigned int have_csum:1; | ||
61 | unsigned int io_error:1; | ||
62 | }; | ||
53 | u8 csum[BTRFS_CSUM_SIZE]; | 63 | u8 csum[BTRFS_CSUM_SIZE]; |
54 | }; | 64 | }; |
55 | 65 | ||
@@ -60,12 +70,25 @@ struct scrub_bio {
60 | int err; | 70 | int err; |
61 | u64 logical; | 71 | u64 logical; |
62 | u64 physical; | 72 | u64 physical; |
63 | struct scrub_page spag[SCRUB_PAGES_PER_BIO]; | 73 | struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; |
64 | u64 count; | 74 | int page_count; |
65 | int next_free; | 75 | int next_free; |
66 | struct btrfs_work work; | 76 | struct btrfs_work work; |
67 | }; | 77 | }; |
68 | 78 | ||
79 | struct scrub_block { | ||
80 | struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; | ||
81 | int page_count; | ||
82 | atomic_t outstanding_pages; | ||
83 | atomic_t ref_count; /* free mem on transition to zero */ | ||
84 | struct scrub_dev *sdev; | ||
85 | struct { | ||
86 | unsigned int header_error:1; | ||
87 | unsigned int checksum_error:1; | ||
88 | unsigned int no_io_error_seen:1; | ||
89 | }; | ||
90 | }; | ||
91 | |||
69 | struct scrub_dev { | 92 | struct scrub_dev { |
70 | struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; | 93 | struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; |
71 | struct btrfs_device *dev; | 94 | struct btrfs_device *dev; |
@@ -79,6 +102,10 @@ struct scrub_dev {
79 | struct list_head csum_list; | 102 | struct list_head csum_list; |
80 | atomic_t cancel_req; | 103 | atomic_t cancel_req; |
81 | int readonly; | 104 | int readonly; |
105 | int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ | ||
106 | u32 sectorsize; | ||
107 | u32 nodesize; | ||
108 | u32 leafsize; | ||
82 | /* | 109 | /* |
83 | * statistics | 110 | * statistics |
84 | */ | 111 | */ |
@@ -107,19 +134,41 @@ struct scrub_warning {
107 | int scratch_bufsize; | 134 | int scratch_bufsize; |
108 | }; | 135 | }; |
109 | 136 | ||
137 | |||
138 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); | ||
139 | static int scrub_setup_recheck_block(struct scrub_dev *sdev, | ||
140 | struct btrfs_mapping_tree *map_tree, | ||
141 | u64 length, u64 logical, | ||
142 | struct scrub_block *sblock); | ||
143 | static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | ||
144 | struct scrub_block *sblock, int is_metadata, | ||
145 | int have_csum, u8 *csum, u64 generation, | ||
146 | u16 csum_size); | ||
147 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | ||
148 | struct scrub_block *sblock, | ||
149 | int is_metadata, int have_csum, | ||
150 | const u8 *csum, u64 generation, | ||
151 | u16 csum_size); | ||
152 | static void scrub_complete_bio_end_io(struct bio *bio, int err); | ||
153 | static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, | ||
154 | struct scrub_block *sblock_good, | ||
155 | int force_write); | ||
156 | static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, | ||
157 | struct scrub_block *sblock_good, | ||
158 | int page_num, int force_write); | ||
159 | static int scrub_checksum_data(struct scrub_block *sblock); | ||
160 | static int scrub_checksum_tree_block(struct scrub_block *sblock); | ||
161 | static int scrub_checksum_super(struct scrub_block *sblock); | ||
162 | static void scrub_block_get(struct scrub_block *sblock); | ||
163 | static void scrub_block_put(struct scrub_block *sblock); | ||
164 | static int scrub_add_page_to_bio(struct scrub_dev *sdev, | ||
165 | struct scrub_page *spage); | ||
166 | static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | ||
167 | u64 physical, u64 flags, u64 gen, int mirror_num, | ||
168 | u8 *csum, int force); | ||
110 | static void scrub_bio_end_io(struct bio *bio, int err); | 169 | static void scrub_bio_end_io(struct bio *bio, int err); |
111 | static void scrub_checksum(struct btrfs_work *work); | 170 | static void scrub_bio_end_io_worker(struct btrfs_work *work); |
112 | static int scrub_checksum_data(struct scrub_dev *sdev, | 171 | static void scrub_block_complete(struct scrub_block *sblock); |
113 | struct scrub_page *spag, void *buffer); | ||
114 | static int scrub_checksum_tree_block(struct scrub_dev *sdev, | ||
115 | struct scrub_page *spag, u64 logical, | ||
116 | void *buffer); | ||
117 | static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); | ||
118 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix); | ||
119 | static void scrub_fixup_end_io(struct bio *bio, int err); | ||
120 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, | ||
121 | struct page *page); | ||
122 | static void scrub_fixup(struct scrub_bio *sbio, int ix); | ||
123 | 172 | ||
124 | 173 | ||
125 | static void scrub_free_csums(struct scrub_dev *sdev) | 174 | static void scrub_free_csums(struct scrub_dev *sdev) |
@@ -133,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev)
133 | } | 182 | } |
134 | } | 183 | } |
135 | 184 | ||
136 | static void scrub_free_bio(struct bio *bio) | ||
137 | { | ||
138 | int i; | ||
139 | struct page *last_page = NULL; | ||
140 | |||
141 | if (!bio) | ||
142 | return; | ||
143 | |||
144 | for (i = 0; i < bio->bi_vcnt; ++i) { | ||
145 | if (bio->bi_io_vec[i].bv_page == last_page) | ||
146 | continue; | ||
147 | last_page = bio->bi_io_vec[i].bv_page; | ||
148 | __free_page(last_page); | ||
149 | } | ||
150 | bio_put(bio); | ||
151 | } | ||
152 | |||
153 | static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) | 185 | static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) |
154 | { | 186 | { |
155 | int i; | 187 | int i; |
@@ -157,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
157 | if (!sdev) | 189 | if (!sdev) |
158 | return; | 190 | return; |
159 | 191 | ||
192 | /* this can happen when scrub is cancelled */ | ||
193 | if (sdev->curr != -1) { | ||
194 | struct scrub_bio *sbio = sdev->bios[sdev->curr]; | ||
195 | |||
196 | for (i = 0; i < sbio->page_count; i++) { | ||
197 | BUG_ON(!sbio->pagev[i]); | ||
198 | BUG_ON(!sbio->pagev[i]->page); | ||
199 | scrub_block_put(sbio->pagev[i]->sblock); | ||
200 | } | ||
201 | bio_put(sbio->bio); | ||
202 | } | ||
203 | |||
160 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { | 204 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { |
161 | struct scrub_bio *sbio = sdev->bios[i]; | 205 | struct scrub_bio *sbio = sdev->bios[i]; |
162 | 206 | ||
163 | if (!sbio) | 207 | if (!sbio) |
164 | break; | 208 | break; |
165 | |||
166 | scrub_free_bio(sbio->bio); | ||
167 | kfree(sbio); | 209 | kfree(sbio); |
168 | } | 210 | } |
169 | 211 | ||
@@ -177,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
177 | struct scrub_dev *sdev; | 219 | struct scrub_dev *sdev; |
178 | int i; | 220 | int i; |
179 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | 221 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; |
222 | int pages_per_bio; | ||
180 | 223 | ||
224 | pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, | ||
225 | bio_get_nr_vecs(dev->bdev)); | ||
181 | sdev = kzalloc(sizeof(*sdev), GFP_NOFS); | 226 | sdev = kzalloc(sizeof(*sdev), GFP_NOFS); |
182 | if (!sdev) | 227 | if (!sdev) |
183 | goto nomem; | 228 | goto nomem; |
184 | sdev->dev = dev; | 229 | sdev->dev = dev; |
230 | sdev->pages_per_bio = pages_per_bio; | ||
231 | sdev->curr = -1; | ||
185 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { | 232 | for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { |
186 | struct scrub_bio *sbio; | 233 | struct scrub_bio *sbio; |
187 | 234 | ||
@@ -192,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
192 | 239 | ||
193 | sbio->index = i; | 240 | sbio->index = i; |
194 | sbio->sdev = sdev; | 241 | sbio->sdev = sdev; |
195 | sbio->count = 0; | 242 | sbio->page_count = 0; |
196 | sbio->work.func = scrub_checksum; | 243 | sbio->work.func = scrub_bio_end_io_worker; |
197 | 244 | ||
198 | if (i != SCRUB_BIOS_PER_DEV-1) | 245 | if (i != SCRUB_BIOS_PER_DEV-1) |
199 | sdev->bios[i]->next_free = i + 1; | 246 | sdev->bios[i]->next_free = i + 1; |
@@ -201,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
201 | sdev->bios[i]->next_free = -1; | 248 | sdev->bios[i]->next_free = -1; |
202 | } | 249 | } |
203 | sdev->first_free = 0; | 250 | sdev->first_free = 0; |
204 | sdev->curr = -1; | 251 | sdev->nodesize = dev->dev_root->nodesize; |
252 | sdev->leafsize = dev->dev_root->leafsize; | ||
253 | sdev->sectorsize = dev->dev_root->sectorsize; | ||
205 | atomic_set(&sdev->in_flight, 0); | 254 | atomic_set(&sdev->in_flight, 0); |
206 | atomic_set(&sdev->fixup_cnt, 0); | 255 | atomic_set(&sdev->fixup_cnt, 0); |
207 | atomic_set(&sdev->cancel_req, 0); | 256 | atomic_set(&sdev->cancel_req, 0); |
@@ -292,10 +341,9 @@ err:
292 | return 0; | 341 | return 0; |
293 | } | 342 | } |
294 | 343 | ||
295 | static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | 344 | static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) |
296 | int ix) | ||
297 | { | 345 | { |
298 | struct btrfs_device *dev = sbio->sdev->dev; | 346 | struct btrfs_device *dev = sblock->sdev->dev; |
299 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | 347 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; |
300 | struct btrfs_path *path; | 348 | struct btrfs_path *path; |
301 | struct btrfs_key found_key; | 349 | struct btrfs_key found_key; |
@@ -314,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
314 | 362 | ||
315 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); | 363 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); |
316 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); | 364 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); |
317 | swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | 365 | BUG_ON(sblock->page_count < 1); |
318 | swarn.logical = sbio->logical + ix * PAGE_SIZE; | 366 | swarn.sector = (sblock->pagev[0].physical) >> 9; |
367 | swarn.logical = sblock->pagev[0].logical; | ||
319 | swarn.errstr = errstr; | 368 | swarn.errstr = errstr; |
320 | swarn.dev = dev; | 369 | swarn.dev = dev; |
321 | swarn.msg_bufsize = bufsize; | 370 | swarn.msg_bufsize = bufsize; |
@@ -530,9 +579,9 @@ out:
530 | spin_lock(&sdev->stat_lock); | 579 | spin_lock(&sdev->stat_lock); |
531 | ++sdev->stat.uncorrectable_errors; | 580 | ++sdev->stat.uncorrectable_errors; |
532 | spin_unlock(&sdev->stat_lock); | 581 | spin_unlock(&sdev->stat_lock); |
533 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup " | 582 | printk_ratelimited(KERN_ERR |
534 | "(nodatasum) error at logical %llu\n", | 583 | "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", |
535 | fixup->logical); | 584 | (unsigned long long)fixup->logical, sdev->dev->name); |
536 | } | 585 | } |
537 | 586 | ||
538 | btrfs_free_path(path); | 587 | btrfs_free_path(path); |
@@ -549,91 +598,168 @@ out:
549 | } | 598 | } |
550 | 599 | ||
551 | /* | 600 | /* |
552 | * scrub_recheck_error gets called when either verification of the page | 601 | * scrub_handle_errored_block gets called when either verification of the |
553 | * failed or the bio failed to read, e.g. with EIO. In the latter case, | 602 | * pages failed or the bio failed to read, e.g. with EIO. In the latter |
554 | * recheck_error gets called for every page in the bio, even though only | 603 | * case, this function handles all pages in the bio, even though only one |
555 | * one may be bad | 604 | * may be bad. |
605 | * The goal of this function is to repair the errored block by using the | ||
606 | * contents of one of the mirrors. | ||
556 | */ | 607 | */ |
557 | static int scrub_recheck_error(struct scrub_bio *sbio, int ix) | 608 | static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) |
558 | { | 609 | { |
559 | struct scrub_dev *sdev = sbio->sdev; | 610 | struct scrub_dev *sdev = sblock_to_check->sdev; |
560 | u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | 611 | struct btrfs_fs_info *fs_info; |
612 | u64 length; | ||
613 | u64 logical; | ||
614 | u64 generation; | ||
615 | unsigned int failed_mirror_index; | ||
616 | unsigned int is_metadata; | ||
617 | unsigned int have_csum; | ||
618 | u8 *csum; | ||
619 | struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ | ||
620 | struct scrub_block *sblock_bad; | ||
621 | int ret; | ||
622 | int mirror_index; | ||
623 | int page_num; | ||
624 | int success; | ||
561 | static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, | 625 | static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, |
562 | DEFAULT_RATELIMIT_BURST); | 626 | DEFAULT_RATELIMIT_BURST); |
627 | |||
628 | BUG_ON(sblock_to_check->page_count < 1); | ||
629 | fs_info = sdev->dev->dev_root->fs_info; | ||
630 | length = sblock_to_check->page_count * PAGE_SIZE; | ||
631 | logical = sblock_to_check->pagev[0].logical; | ||
632 | generation = sblock_to_check->pagev[0].generation; | ||
633 | BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); | ||
634 | failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; | ||
635 | is_metadata = !(sblock_to_check->pagev[0].flags & | ||
636 | BTRFS_EXTENT_FLAG_DATA); | ||
637 | have_csum = sblock_to_check->pagev[0].have_csum; | ||
638 | csum = sblock_to_check->pagev[0].csum; | ||
563 | 639 | ||
564 | if (sbio->err) { | 640 | /* |
565 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, | 641 | * read all mirrors one after the other. This includes to |
566 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { | 642 | * re-read the extent or metadata block that failed (that was |
567 | if (scrub_fixup_check(sbio, ix) == 0) | 643 | * the cause that this fixup code is called) another time, |
568 | return 0; | 644 | * page by page this time in order to know which pages |
569 | } | 645 | * caused I/O errors and which ones are good (for all mirrors). |
570 | if (__ratelimit(&_rs)) | 646 | * It is the goal to handle the situation when more than one |
571 | scrub_print_warning("i/o error", sbio, ix); | 647 | * mirror contains I/O errors, but the errors do not |
572 | } else { | 648 | * overlap, i.e. the data can be repaired by selecting the |
573 | if (__ratelimit(&_rs)) | 649 | * pages from those mirrors without I/O error on the |
574 | scrub_print_warning("checksum error", sbio, ix); | 650 | * particular pages. One example (with blocks >= 2 * PAGE_SIZE) |
651 | * would be that mirror #1 has an I/O error on the first page, | ||
652 | * the second page is good, and mirror #2 has an I/O error on | ||
653 | * the second page, but the first page is good. | ||
654 | * Then the first page of the first mirror can be repaired by | ||
655 | * taking the first page of the second mirror, and the | ||
656 | * second page of the second mirror can be repaired by | ||
657 | * copying the contents of the 2nd page of the 1st mirror. | ||
658 | * One more note: if the pages of one mirror contain I/O | ||
659 | * errors, the checksum cannot be verified. In order to get | ||
660 | * the best data for repairing, the first attempt is to find | ||
661 | * a mirror without I/O errors and with a validated checksum. | ||
662 | * Only if this is not possible, the pages are picked from | ||
663 | * mirrors with I/O errors without considering the checksum. | ||
664 | * If the latter is the case, at the end, the checksum of the | ||
665 | * repaired area is verified in order to correctly maintain | ||
666 | * the statistics. | ||
667 | */ | ||
668 | |||
669 | sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * | ||
670 | sizeof(*sblocks_for_recheck), | ||
671 | GFP_NOFS); | ||
672 | if (!sblocks_for_recheck) { | ||
673 | spin_lock(&sdev->stat_lock); | ||
674 | sdev->stat.malloc_errors++; | ||
675 | sdev->stat.read_errors++; | ||
676 | sdev->stat.uncorrectable_errors++; | ||
677 | spin_unlock(&sdev->stat_lock); | ||
678 | goto out; | ||
575 | } | 679 | } |
576 | 680 | ||
577 | spin_lock(&sdev->stat_lock); | 681 | /* setup the context, map the logical blocks and alloc the pages */ |
578 | ++sdev->stat.read_errors; | 682 | ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, |
579 | spin_unlock(&sdev->stat_lock); | 683 | logical, sblocks_for_recheck); |
684 | if (ret) { | ||
685 | spin_lock(&sdev->stat_lock); | ||
686 | sdev->stat.read_errors++; | ||
687 | sdev->stat.uncorrectable_errors++; | ||
688 | spin_unlock(&sdev->stat_lock); | ||
689 | goto out; | ||
690 | } | ||
691 | BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); | ||
692 | sblock_bad = sblocks_for_recheck + failed_mirror_index; | ||
580 | 693 | ||
581 | scrub_fixup(sbio, ix); | 694 | /* build and submit the bios for the failed mirror, check checksums */ |
582 | return 1; | 695 | ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, |
583 | } | 696 | csum, generation, sdev->csum_size); |
697 | if (ret) { | ||
698 | spin_lock(&sdev->stat_lock); | ||
699 | sdev->stat.read_errors++; | ||
700 | sdev->stat.uncorrectable_errors++; | ||
701 | spin_unlock(&sdev->stat_lock); | ||
702 | goto out; | ||
703 | } | ||
584 | 704 | ||
585 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) | 705 | if (!sblock_bad->header_error && !sblock_bad->checksum_error && |
586 | { | 706 | sblock_bad->no_io_error_seen) { |
587 | int ret = 1; | 707 | /* |
588 | struct page *page; | 708 | * the error disappeared after reading page by page, or |
589 | void *buffer; | 709 | * the area was part of a huge bio and other parts of the |
590 | u64 flags = sbio->spag[ix].flags; | 710 | * bio caused I/O errors, or the block layer merged several |
711 | * read requests into one and the error is caused by a | ||
712 | * different bio (usually one of the two latter cases is | ||
713 | * the cause) | ||
714 | */ | ||
715 | spin_lock(&sdev->stat_lock); | ||
716 | sdev->stat.unverified_errors++; | ||
717 | spin_unlock(&sdev->stat_lock); | ||
591 | 718 | ||
592 | page = sbio->bio->bi_io_vec[ix].bv_page; | 719 | goto out; |
593 | buffer = kmap_atomic(page, KM_USER0); | ||
594 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | ||
595 | ret = scrub_checksum_data(sbio->sdev, | ||
596 | sbio->spag + ix, buffer); | ||
597 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
598 | ret = scrub_checksum_tree_block(sbio->sdev, | ||
599 | sbio->spag + ix, | ||
600 | sbio->logical + ix * PAGE_SIZE, | ||
601 | buffer); | ||
602 | } else { | ||
603 | WARN_ON(1); | ||
604 | } | 720 | } |
605 | kunmap_atomic(buffer, KM_USER0); | ||
606 | 721 | ||
607 | return ret; | 722 | if (!sblock_bad->no_io_error_seen) { |
608 | } | 723 | spin_lock(&sdev->stat_lock); |
724 | sdev->stat.read_errors++; | ||
725 | spin_unlock(&sdev->stat_lock); | ||
726 | if (__ratelimit(&_rs)) | ||
727 | scrub_print_warning("i/o error", sblock_to_check); | ||
728 | } else if (sblock_bad->checksum_error) { | ||
729 | spin_lock(&sdev->stat_lock); | ||
730 | sdev->stat.csum_errors++; | ||
731 | spin_unlock(&sdev->stat_lock); | ||
732 | if (__ratelimit(&_rs)) | ||
733 | scrub_print_warning("checksum error", sblock_to_check); | ||
734 | } else if (sblock_bad->header_error) { | ||
735 | spin_lock(&sdev->stat_lock); | ||
736 | sdev->stat.verify_errors++; | ||
737 | spin_unlock(&sdev->stat_lock); | ||
738 | if (__ratelimit(&_rs)) | ||
739 | scrub_print_warning("checksum/header error", | ||
740 | sblock_to_check); | ||
741 | } | ||
609 | 742 | ||
610 | static void scrub_fixup_end_io(struct bio *bio, int err) | 743 | if (sdev->readonly) |
611 | { | 744 | goto did_not_correct_error; |
612 | complete((struct completion *)bio->bi_private); | 745 | |
613 | } | 746 | if (!is_metadata && !have_csum) { |
747 | struct scrub_fixup_nodatasum *fixup_nodatasum; | ||
614 | 748 | ||
615 | static void scrub_fixup(struct scrub_bio *sbio, int ix) | 749 | /* |
616 | { | 750 | * !is_metadata and !have_csum, this means that the data |
617 | struct scrub_dev *sdev = sbio->sdev; | 751 | * might not be COW'ed, that it might be modified |
618 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 752 | * concurrently. The general strategy to work on the |
619 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 753 | * commit root does not help in the case when COW is not |
620 | struct btrfs_bio *bbio = NULL; | 754 | * used. |
621 | struct scrub_fixup_nodatasum *fixup; | 755 | */ |
622 | u64 logical = sbio->logical + ix * PAGE_SIZE; | 756 | fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); |
623 | u64 length; | 757 | if (!fixup_nodatasum) |
624 | int i; | 758 | goto did_not_correct_error; |
625 | int ret; | 759 | fixup_nodatasum->sdev = sdev; |
626 | DECLARE_COMPLETION_ONSTACK(complete); | 760 | fixup_nodatasum->logical = logical; |
627 | 761 | fixup_nodatasum->root = fs_info->extent_root; | |
628 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && | 762 | fixup_nodatasum->mirror_num = failed_mirror_index + 1; |
629 | (sbio->spag[ix].have_csum == 0)) { | ||
630 | fixup = kzalloc(sizeof(*fixup), GFP_NOFS); | ||
631 | if (!fixup) | ||
632 | goto uncorrectable; | ||
633 | fixup->sdev = sdev; | ||
634 | fixup->logical = logical; | ||
635 | fixup->root = fs_info->extent_root; | ||
636 | fixup->mirror_num = sbio->spag[ix].mirror_num; | ||
637 | /* | 763 | /* |
638 | * increment scrubs_running to prevent cancel requests from | 764 | * increment scrubs_running to prevent cancel requests from |
639 | * completing as long as a fixup worker is running. we must also | 765 | * completing as long as a fixup worker is running. we must also |
@@ -648,235 +774,529 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
648 | atomic_inc(&fs_info->scrubs_paused); | 774 | atomic_inc(&fs_info->scrubs_paused); |
649 | mutex_unlock(&fs_info->scrub_lock); | 775 | mutex_unlock(&fs_info->scrub_lock); |
650 | atomic_inc(&sdev->fixup_cnt); | 776 | atomic_inc(&sdev->fixup_cnt); |
651 | fixup->work.func = scrub_fixup_nodatasum; | 777 | fixup_nodatasum->work.func = scrub_fixup_nodatasum; |
652 | btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); | 778 | btrfs_queue_worker(&fs_info->scrub_workers, |
653 | return; | 779 | &fixup_nodatasum->work); |
780 | goto out; | ||
654 | } | 781 | } |
655 | 782 | ||
656 | length = PAGE_SIZE; | 783 | /* |
657 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, | 784 | * now build and submit the bios for the other mirrors, check |
658 | &bbio, 0); | 785 | * checksums |
659 | if (ret || !bbio || length < PAGE_SIZE) { | 786 | */ |
660 | printk(KERN_ERR | 787 | for (mirror_index = 0; |
661 | "scrub_fixup: btrfs_map_block failed us for %llu\n", | 788 | mirror_index < BTRFS_MAX_MIRRORS && |
662 | (unsigned long long)logical); | 789 | sblocks_for_recheck[mirror_index].page_count > 0; |
663 | WARN_ON(1); | 790 | mirror_index++) { |
664 | kfree(bbio); | 791 | if (mirror_index == failed_mirror_index) |
665 | return; | 792 | continue; |
793 | |||
794 | /* build and submit the bios, check checksums */ | ||
795 | ret = scrub_recheck_block(fs_info, | ||
796 | sblocks_for_recheck + mirror_index, | ||
797 | is_metadata, have_csum, csum, | ||
798 | generation, sdev->csum_size); | ||
799 | if (ret) | ||
800 | goto did_not_correct_error; | ||
666 | } | 801 | } |
667 | 802 | ||
668 | if (bbio->num_stripes == 1) | 803 | /* |
669 | /* there aren't any replicas */ | 804 | * first try to pick the mirror which is completely without I/O |
670 | goto uncorrectable; | 805 | * errors and also does not have a checksum error. |
806 | * If one is found, and if a checksum is present, the full block | ||
807 | * that is known to contain an error is rewritten. Afterwards | ||
808 | * the block is known to be corrected. | ||
809 | * If a mirror is found which is completely correct, and no | ||
810 | * checksum is present, only those pages are rewritten that had | ||
811 | * an I/O error in the block to be repaired, since it cannot be | ||
812 | * determined, which copy of the other pages is better (and it | ||
813 | * could happen otherwise that a correct page would be | ||
814 | * overwritten by a bad one). | ||
815 | */ | ||
816 | for (mirror_index = 0; | ||
817 | mirror_index < BTRFS_MAX_MIRRORS && | ||
818 | sblocks_for_recheck[mirror_index].page_count > 0; | ||
819 | mirror_index++) { | ||
820 | struct scrub_block *sblock_other = sblocks_for_recheck + | ||
821 | mirror_index; | ||
822 | |||
823 | if (!sblock_other->header_error && | ||
824 | !sblock_other->checksum_error && | ||
825 | sblock_other->no_io_error_seen) { | ||
826 | int force_write = is_metadata || have_csum; | ||
827 | |||
828 | ret = scrub_repair_block_from_good_copy(sblock_bad, | ||
829 | sblock_other, | ||
830 | force_write); | ||
831 | if (0 == ret) | ||
832 | goto corrected_error; | ||
833 | } | ||
834 | } | ||
671 | 835 | ||
672 | /* | 836 | /* |
673 | * first find a good copy | 837 | * in case of I/O errors in the area that is supposed to be |
838 | * repaired, continue by picking good copies of those pages. | ||
839 | * Select the good pages from mirrors to rewrite bad pages from | ||
840 | * the area to fix. Afterwards verify the checksum of the block | ||
841 | * that is supposed to be repaired. This verification step is | ||
842 | * only done for the purpose of statistic counting and for the | ||
843 | * final scrub report, whether errors remain. | ||
844 | * A perfect algorithm could make use of the checksum and try | ||
845 | * all possible combinations of pages from the different mirrors | ||
846 | * until the checksum verification succeeds. For example, when | ||
847 | * the 2nd page of mirror #1 faces I/O errors, and the 2nd page | ||
848 | * of mirror #2 is readable but the final checksum test fails, | ||
849 | * then the 2nd page of mirror #3 could be tried, whether now | ||
850 | * the final checksum succeeds. But this would be a rare | ||
851 | * exception and is therefore not implemented. At least it is | ||
852 | * avoided that the good copy is overwritten. | ||
853 | * A more useful improvement would be to pick the sectors | ||
854 | * without I/O error based on sector sizes (512 bytes on legacy | ||
855 | * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one | ||
856 | * mirror could be repaired by taking 512 byte of a different | ||
857 | * mirror, even if other 512 byte sectors in the same PAGE_SIZE | ||
858 | * area are unreadable. | ||
674 | */ | 859 | */ |
675 | for (i = 0; i < bbio->num_stripes; ++i) { | ||
676 | if (i + 1 == sbio->spag[ix].mirror_num) | ||
677 | continue; | ||
678 | 860 | ||
679 | if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, | 861 | /* can only fix I/O errors from here on */ |
680 | bbio->stripes[i].physical >> 9, | 862 | if (sblock_bad->no_io_error_seen) |
681 | sbio->bio->bi_io_vec[ix].bv_page)) { | 863 | goto did_not_correct_error; |
682 | /* I/O-error, this is not a good copy */ | 864 | |
865 | success = 1; | ||
866 | for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { | ||
867 | struct scrub_page *page_bad = sblock_bad->pagev + page_num; | ||
868 | |||
869 | if (!page_bad->io_error) | ||
683 | continue; | 870 | continue; |
871 | |||
872 | for (mirror_index = 0; | ||
873 | mirror_index < BTRFS_MAX_MIRRORS && | ||
874 | sblocks_for_recheck[mirror_index].page_count > 0; | ||
875 | mirror_index++) { | ||
876 | struct scrub_block *sblock_other = sblocks_for_recheck + | ||
877 | mirror_index; | ||
878 | struct scrub_page *page_other = sblock_other->pagev + | ||
879 | page_num; | ||
880 | |||
881 | if (!page_other->io_error) { | ||
882 | ret = scrub_repair_page_from_good_copy( | ||
883 | sblock_bad, sblock_other, page_num, 0); | ||
884 | if (0 == ret) { | ||
885 | page_bad->io_error = 0; | ||
886 | break; /* succeeded for this page */ | ||
887 | } | ||
888 | } | ||
684 | } | 889 | } |
685 | 890 | ||
686 | if (scrub_fixup_check(sbio, ix) == 0) | 891 | if (page_bad->io_error) { |
687 | break; | 892 | /* did not find a mirror to copy the page from */ |
893 | success = 0; | ||
894 | } | ||
688 | } | 895 | } |
689 | if (i == bbio->num_stripes) | ||
690 | goto uncorrectable; | ||
691 | 896 | ||
692 | if (!sdev->readonly) { | 897 | if (success) { |
693 | /* | 898 | if (is_metadata || have_csum) { |
694 | * bi_io_vec[ix].bv_page now contains good data, write it back | 899 | /* |
695 | */ | 900 | * need to verify the checksum now that all |
696 | if (scrub_fixup_io(WRITE, sdev->dev->bdev, | 901 | * sectors on disk are repaired (the write |
697 | (sbio->physical + ix * PAGE_SIZE) >> 9, | 902 | * request for data to be repaired is on its way). |
698 | sbio->bio->bi_io_vec[ix].bv_page)) { | 903 | * Just be lazy and use scrub_recheck_block() |
699 | /* I/O-error, writeback failed, give up */ | 904 | * which re-reads the data before the checksum |
700 | goto uncorrectable; | 905 | * is verified, but most likely the data comes out |
906 | * of the page cache. | ||
907 | */ | ||
908 | ret = scrub_recheck_block(fs_info, sblock_bad, | ||
909 | is_metadata, have_csum, csum, | ||
910 | generation, sdev->csum_size); | ||
911 | if (!ret && !sblock_bad->header_error && | ||
912 | !sblock_bad->checksum_error && | ||
913 | sblock_bad->no_io_error_seen) | ||
914 | goto corrected_error; | ||
915 | else | ||
916 | goto did_not_correct_error; | ||
917 | } else { | ||
918 | corrected_error: | ||
919 | spin_lock(&sdev->stat_lock); | ||
920 | sdev->stat.corrected_errors++; | ||
921 | spin_unlock(&sdev->stat_lock); | ||
922 | printk_ratelimited(KERN_ERR | ||
923 | "btrfs: fixed up error at logical %llu on dev %s\n", | ||
924 | (unsigned long long)logical, sdev->dev->name); | ||
701 | } | 925 | } |
926 | } else { | ||
927 | did_not_correct_error: | ||
928 | spin_lock(&sdev->stat_lock); | ||
929 | sdev->stat.uncorrectable_errors++; | ||
930 | spin_unlock(&sdev->stat_lock); | ||
931 | printk_ratelimited(KERN_ERR | ||
932 | "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", | ||
933 | (unsigned long long)logical, sdev->dev->name); | ||
702 | } | 934 | } |
703 | 935 | ||
704 | kfree(bbio); | 936 | out: |
705 | spin_lock(&sdev->stat_lock); | 937 | if (sblocks_for_recheck) { |
706 | ++sdev->stat.corrected_errors; | 938 | for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; |
707 | spin_unlock(&sdev->stat_lock); | 939 | mirror_index++) { |
940 | struct scrub_block *sblock = sblocks_for_recheck + | ||
941 | mirror_index; | ||
942 | int page_index; | ||
943 | |||
944 | for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; | ||
945 | page_index++) | ||
946 | if (sblock->pagev[page_index].page) | ||
947 | __free_page( | ||
948 | sblock->pagev[page_index].page); | ||
949 | } | ||
950 | kfree(sblocks_for_recheck); | ||
951 | } | ||
708 | 952 | ||
709 | printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", | 953 | return 0; |
710 | (unsigned long long)logical); | 954 | } |
711 | return; | ||
712 | 955 | ||
713 | uncorrectable: | 956 | static int scrub_setup_recheck_block(struct scrub_dev *sdev, |
714 | kfree(bbio); | 957 | struct btrfs_mapping_tree *map_tree, |
715 | spin_lock(&sdev->stat_lock); | 958 | u64 length, u64 logical, |
716 | ++sdev->stat.uncorrectable_errors; | 959 | struct scrub_block *sblocks_for_recheck) |
717 | spin_unlock(&sdev->stat_lock); | 960 | { |
961 | int page_index; | ||
962 | int mirror_index; | ||
963 | int ret; | ||
964 | |||
965 | /* | ||
966 | * note: the three members sdev, ref_count and outstanding_pages | ||
967 | * are not used (and not set) in the blocks that are used for | ||
968 | * the recheck procedure | ||
969 | */ | ||
970 | |||
971 | page_index = 0; | ||
972 | while (length > 0) { | ||
973 | u64 sublen = min_t(u64, length, PAGE_SIZE); | ||
974 | u64 mapped_length = sublen; | ||
975 | struct btrfs_bio *bbio = NULL; | ||
718 | 976 | ||
719 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " | 977 | /* |
720 | "logical %llu\n", (unsigned long long)logical); | 978 | * with a length of PAGE_SIZE, each returned stripe |
979 | * represents one mirror | ||
980 | */ | ||
981 | ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, | ||
982 | &bbio, 0); | ||
983 | if (ret || !bbio || mapped_length < sublen) { | ||
984 | kfree(bbio); | ||
985 | return -EIO; | ||
986 | } | ||
987 | |||
988 | BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); | ||
989 | for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; | ||
990 | mirror_index++) { | ||
991 | struct scrub_block *sblock; | ||
992 | struct scrub_page *page; | ||
993 | |||
994 | if (mirror_index >= BTRFS_MAX_MIRRORS) | ||
995 | continue; | ||
996 | |||
997 | sblock = sblocks_for_recheck + mirror_index; | ||
998 | page = sblock->pagev + page_index; | ||
999 | page->logical = logical; | ||
1000 | page->physical = bbio->stripes[mirror_index].physical; | ||
1001 | page->bdev = bbio->stripes[mirror_index].dev->bdev; | ||
1002 | page->mirror_num = mirror_index + 1; | ||
1003 | page->page = alloc_page(GFP_NOFS); | ||
1004 | if (!page->page) { | ||
1005 | spin_lock(&sdev->stat_lock); | ||
1006 | sdev->stat.malloc_errors++; | ||
1007 | spin_unlock(&sdev->stat_lock); | ||
1008 | return -ENOMEM; | ||
1009 | } | ||
1010 | sblock->page_count++; | ||
1011 | } | ||
1012 | kfree(bbio); | ||
1013 | length -= sublen; | ||
1014 | logical += sublen; | ||
1015 | page_index++; | ||
1016 | } | ||
1017 | |||
1018 | return 0; | ||
721 | } | 1019 | } |
722 | 1020 | ||
723 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, | 1021 | /* |
724 | struct page *page) | 1022 | * this function will check the on disk data for checksum errors, header |
1023 | * errors and read I/O errors. If any I/O errors happen, the exact pages | ||
1024 | * which are errored are marked as being bad. The goal is to enable scrub | ||
1025 | * to take those pages that are not errored from all the mirrors so that | ||
1026 | * the pages that are errored in the just handled mirror can be repaired. | ||
1027 | */ | ||
1028 | static int scrub_recheck_block(struct btrfs_fs_info *fs_info, | ||
1029 | struct scrub_block *sblock, int is_metadata, | ||
1030 | int have_csum, u8 *csum, u64 generation, | ||
1031 | u16 csum_size) | ||
725 | { | 1032 | { |
726 | struct bio *bio = NULL; | 1033 | int page_num; |
727 | int ret; | ||
728 | DECLARE_COMPLETION_ONSTACK(complete); | ||
729 | 1034 | ||
730 | bio = bio_alloc(GFP_NOFS, 1); | 1035 | sblock->no_io_error_seen = 1; |
731 | bio->bi_bdev = bdev; | 1036 | sblock->header_error = 0; |
732 | bio->bi_sector = sector; | 1037 | sblock->checksum_error = 0; |
733 | bio_add_page(bio, page, PAGE_SIZE, 0); | ||
734 | bio->bi_end_io = scrub_fixup_end_io; | ||
735 | bio->bi_private = &complete; | ||
736 | btrfsic_submit_bio(rw, bio); | ||
737 | 1038 | ||
738 | /* this will also unplug the queue */ | 1039 | for (page_num = 0; page_num < sblock->page_count; page_num++) { |
739 | wait_for_completion(&complete); | 1040 | struct bio *bio; |
1041 | int ret; | ||
1042 | struct scrub_page *page = sblock->pagev + page_num; | ||
1043 | DECLARE_COMPLETION_ONSTACK(complete); | ||
1044 | |||
1045 | BUG_ON(!page->page); | ||
1046 | bio = bio_alloc(GFP_NOFS, 1); | ||
1047 | bio->bi_bdev = page->bdev; | ||
1048 | bio->bi_sector = page->physical >> 9; | ||
1049 | bio->bi_end_io = scrub_complete_bio_end_io; | ||
1050 | bio->bi_private = &complete; | ||
1051 | |||
1052 | ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); | ||
1053 | if (PAGE_SIZE != ret) { | ||
1054 | bio_put(bio); | ||
1055 | return -EIO; | ||
1056 | } | ||
1057 | btrfsic_submit_bio(READ, bio); | ||
740 | 1058 | ||
741 | ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); | 1059 | /* this will also unplug the queue */ |
742 | bio_put(bio); | 1060 | wait_for_completion(&complete); |
743 | return ret; | 1061 | |
1062 | page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1063 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
1064 | sblock->no_io_error_seen = 0; | ||
1065 | bio_put(bio); | ||
1066 | } | ||
1067 | |||
1068 | if (sblock->no_io_error_seen) | ||
1069 | scrub_recheck_block_checksum(fs_info, sblock, is_metadata, | ||
1070 | have_csum, csum, generation, | ||
1071 | csum_size); | ||
1072 | |||
1073 | return 0; | ||
744 | } | 1074 | } |
745 | 1075 | ||
746 | static void scrub_bio_end_io(struct bio *bio, int err) | 1076 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, |
1077 | struct scrub_block *sblock, | ||
1078 | int is_metadata, int have_csum, | ||
1079 | const u8 *csum, u64 generation, | ||
1080 | u16 csum_size) | ||
747 | { | 1081 | { |
748 | struct scrub_bio *sbio = bio->bi_private; | 1082 | int page_num; |
749 | struct scrub_dev *sdev = sbio->sdev; | 1083 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
750 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 1084 | u32 crc = ~(u32)0; |
1085 | struct btrfs_root *root = fs_info->extent_root; | ||
1086 | void *mapped_buffer; | ||
1087 | |||
1088 | BUG_ON(!sblock->pagev[0].page); | ||
1089 | if (is_metadata) { | ||
1090 | struct btrfs_header *h; | ||
1091 | |||
1092 | mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); | ||
1093 | h = (struct btrfs_header *)mapped_buffer; | ||
1094 | |||
1095 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || | ||
1096 | generation != le64_to_cpu(h->generation) || | ||
1097 | memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || | ||
1098 | memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, | ||
1099 | BTRFS_UUID_SIZE)) | ||
1100 | sblock->header_error = 1; | ||
1101 | csum = h->csum; | ||
1102 | } else { | ||
1103 | if (!have_csum) | ||
1104 | return; | ||
751 | 1105 | ||
752 | sbio->err = err; | 1106 | mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0); |
753 | sbio->bio = bio; | 1107 | } |
754 | 1108 | ||
755 | btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); | 1109 | for (page_num = 0;;) { |
1110 | if (page_num == 0 && is_metadata) | ||
1111 | crc = btrfs_csum_data(root, | ||
1112 | ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, | ||
1113 | crc, PAGE_SIZE - BTRFS_CSUM_SIZE); | ||
1114 | else | ||
1115 | crc = btrfs_csum_data(root, mapped_buffer, crc, | ||
1116 | PAGE_SIZE); | ||
1117 | |||
1118 | kunmap_atomic(mapped_buffer, KM_USER0); | ||
1119 | page_num++; | ||
1120 | if (page_num >= sblock->page_count) | ||
1121 | break; | ||
1122 | BUG_ON(!sblock->pagev[page_num].page); | ||
1123 | |||
1124 | mapped_buffer = kmap_atomic(sblock->pagev[page_num].page, | ||
1125 | KM_USER0); | ||
1126 | } | ||
1127 | |||
1128 | btrfs_csum_final(crc, calculated_csum); | ||
1129 | if (memcmp(calculated_csum, csum, csum_size)) | ||
1130 | sblock->checksum_error = 1; | ||
756 | } | 1131 | } |
757 | 1132 | ||
758 | static void scrub_checksum(struct btrfs_work *work) | 1133 | static void scrub_complete_bio_end_io(struct bio *bio, int err) |
759 | { | 1134 | { |
760 | struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); | 1135 | complete((struct completion *)bio->bi_private); |
761 | struct scrub_dev *sdev = sbio->sdev; | 1136 | } |
762 | struct page *page; | ||
763 | void *buffer; | ||
764 | int i; | ||
765 | u64 flags; | ||
766 | u64 logical; | ||
767 | int ret; | ||
768 | 1137 | ||
769 | if (sbio->err) { | 1138 | static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, |
770 | ret = 0; | 1139 | struct scrub_block *sblock_good, |
771 | for (i = 0; i < sbio->count; ++i) | 1140 | int force_write) |
772 | ret |= scrub_recheck_error(sbio, i); | 1141 | { |
773 | if (!ret) { | 1142 | int page_num; |
774 | spin_lock(&sdev->stat_lock); | 1143 | int ret = 0; |
775 | ++sdev->stat.unverified_errors; | ||
776 | spin_unlock(&sdev->stat_lock); | ||
777 | } | ||
778 | 1144 | ||
779 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); | 1145 | for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { |
780 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; | 1146 | int ret_sub; |
781 | sbio->bio->bi_phys_segments = 0; | ||
782 | sbio->bio->bi_idx = 0; | ||
783 | 1147 | ||
784 | for (i = 0; i < sbio->count; i++) { | 1148 | ret_sub = scrub_repair_page_from_good_copy(sblock_bad, |
785 | struct bio_vec *bi; | 1149 | sblock_good, |
786 | bi = &sbio->bio->bi_io_vec[i]; | 1150 | page_num, |
787 | bi->bv_offset = 0; | 1151 | force_write); |
788 | bi->bv_len = PAGE_SIZE; | 1152 | if (ret_sub) |
789 | } | 1153 | ret = ret_sub; |
790 | goto out; | ||
791 | } | 1154 | } |
792 | for (i = 0; i < sbio->count; ++i) { | 1155 | |
793 | page = sbio->bio->bi_io_vec[i].bv_page; | 1156 | return ret; |
794 | buffer = kmap_atomic(page, KM_USER0); | 1157 | } |
795 | flags = sbio->spag[i].flags; | 1158 | |
796 | logical = sbio->logical + i * PAGE_SIZE; | 1159 | static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, |
797 | ret = 0; | 1160 | struct scrub_block *sblock_good, |
798 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | 1161 | int page_num, int force_write) |
799 | ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); | 1162 | { |
800 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | 1163 | struct scrub_page *page_bad = sblock_bad->pagev + page_num; |
801 | ret = scrub_checksum_tree_block(sdev, sbio->spag + i, | 1164 | struct scrub_page *page_good = sblock_good->pagev + page_num; |
802 | logical, buffer); | 1165 | |
803 | } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { | 1166 | BUG_ON(sblock_bad->pagev[page_num].page == NULL); |
804 | BUG_ON(i); | 1167 | BUG_ON(sblock_good->pagev[page_num].page == NULL); |
805 | (void)scrub_checksum_super(sbio, buffer); | 1168 | if (force_write || sblock_bad->header_error || |
806 | } else { | 1169 | sblock_bad->checksum_error || page_bad->io_error) { |
807 | WARN_ON(1); | 1170 | struct bio *bio; |
808 | } | 1171 | int ret; |
809 | kunmap_atomic(buffer, KM_USER0); | 1172 | DECLARE_COMPLETION_ONSTACK(complete); |
810 | if (ret) { | 1173 | |
811 | ret = scrub_recheck_error(sbio, i); | 1174 | bio = bio_alloc(GFP_NOFS, 1); |
812 | if (!ret) { | 1175 | bio->bi_bdev = page_bad->bdev; |
813 | spin_lock(&sdev->stat_lock); | 1176 | bio->bi_sector = page_bad->physical >> 9; |
814 | ++sdev->stat.unverified_errors; | 1177 | bio->bi_end_io = scrub_complete_bio_end_io; |
815 | spin_unlock(&sdev->stat_lock); | 1178 | bio->bi_private = &complete; |
816 | } | 1179 | |
1180 | ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); | ||
1181 | if (PAGE_SIZE != ret) { | ||
1182 | bio_put(bio); | ||
1183 | return -EIO; | ||
817 | } | 1184 | } |
1185 | btrfsic_submit_bio(WRITE, bio); | ||
1186 | |||
1187 | /* this will also unplug the queue */ | ||
1188 | wait_for_completion(&complete); | ||
1189 | bio_put(bio); | ||
818 | } | 1190 | } |
819 | 1191 | ||
820 | out: | 1192 | return 0; |
821 | scrub_free_bio(sbio->bio); | 1193 | } |
822 | sbio->bio = NULL; | 1194 | |
823 | spin_lock(&sdev->list_lock); | 1195 | static void scrub_checksum(struct scrub_block *sblock) |
824 | sbio->next_free = sdev->first_free; | 1196 | { |
825 | sdev->first_free = sbio->index; | 1197 | u64 flags; |
826 | spin_unlock(&sdev->list_lock); | 1198 | int ret; |
827 | atomic_dec(&sdev->in_flight); | 1199 | |
828 | wake_up(&sdev->list_wait); | 1200 | BUG_ON(sblock->page_count < 1); |
1201 | flags = sblock->pagev[0].flags; | ||
1202 | ret = 0; | ||
1203 | if (flags & BTRFS_EXTENT_FLAG_DATA) | ||
1204 | ret = scrub_checksum_data(sblock); | ||
1205 | else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) | ||
1206 | ret = scrub_checksum_tree_block(sblock); | ||
1207 | else if (flags & BTRFS_EXTENT_FLAG_SUPER) | ||
1208 | (void)scrub_checksum_super(sblock); | ||
1209 | else | ||
1210 | WARN_ON(1); | ||
1211 | if (ret) | ||
1212 | scrub_handle_errored_block(sblock); | ||
829 | } | 1213 | } |
830 | 1214 | ||
831 | static int scrub_checksum_data(struct scrub_dev *sdev, | 1215 | static int scrub_checksum_data(struct scrub_block *sblock) |
832 | struct scrub_page *spag, void *buffer) | ||
833 | { | 1216 | { |
1217 | struct scrub_dev *sdev = sblock->sdev; | ||
834 | u8 csum[BTRFS_CSUM_SIZE]; | 1218 | u8 csum[BTRFS_CSUM_SIZE]; |
1219 | u8 *on_disk_csum; | ||
1220 | struct page *page; | ||
1221 | void *buffer; | ||
835 | u32 crc = ~(u32)0; | 1222 | u32 crc = ~(u32)0; |
836 | int fail = 0; | 1223 | int fail = 0; |
837 | struct btrfs_root *root = sdev->dev->dev_root; | 1224 | struct btrfs_root *root = sdev->dev->dev_root; |
1225 | u64 len; | ||
1226 | int index; | ||
838 | 1227 | ||
839 | if (!spag->have_csum) | 1228 | BUG_ON(sblock->page_count < 1); |
1229 | if (!sblock->pagev[0].have_csum) | ||
840 | return 0; | 1230 | return 0; |
841 | 1231 | ||
842 | crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); | 1232 | on_disk_csum = sblock->pagev[0].csum; |
1233 | page = sblock->pagev[0].page; | ||
1234 | buffer = kmap_atomic(page, KM_USER0); | ||
1235 | |||
1236 | len = sdev->sectorsize; | ||
1237 | index = 0; | ||
1238 | for (;;) { | ||
1239 | u64 l = min_t(u64, len, PAGE_SIZE); | ||
1240 | |||
1241 | crc = btrfs_csum_data(root, buffer, crc, l); | ||
1242 | kunmap_atomic(buffer, KM_USER0); | ||
1243 | len -= l; | ||
1244 | if (len == 0) | ||
1245 | break; | ||
1246 | index++; | ||
1247 | BUG_ON(index >= sblock->page_count); | ||
1248 | BUG_ON(!sblock->pagev[index].page); | ||
1249 | page = sblock->pagev[index].page; | ||
1250 | buffer = kmap_atomic(page, KM_USER0); | ||
1251 | } | ||
1252 | |||
843 | btrfs_csum_final(crc, csum); | 1253 | btrfs_csum_final(crc, csum); |
844 | if (memcmp(csum, spag->csum, sdev->csum_size)) | 1254 | if (memcmp(csum, on_disk_csum, sdev->csum_size)) |
845 | fail = 1; | 1255 | fail = 1; |
846 | 1256 | ||
847 | spin_lock(&sdev->stat_lock); | 1257 | if (fail) { |
848 | ++sdev->stat.data_extents_scrubbed; | 1258 | spin_lock(&sdev->stat_lock); |
849 | sdev->stat.data_bytes_scrubbed += PAGE_SIZE; | ||
850 | if (fail) | ||
851 | ++sdev->stat.csum_errors; | 1259 | ++sdev->stat.csum_errors; |
852 | spin_unlock(&sdev->stat_lock); | 1260 | spin_unlock(&sdev->stat_lock); |
1261 | } | ||
853 | 1262 | ||
854 | return fail; | 1263 | return fail; |
855 | } | 1264 | } |
856 | 1265 | ||
857 | static int scrub_checksum_tree_block(struct scrub_dev *sdev, | 1266 | static int scrub_checksum_tree_block(struct scrub_block *sblock) |
858 | struct scrub_page *spag, u64 logical, | ||
859 | void *buffer) | ||
860 | { | 1267 | { |
1268 | struct scrub_dev *sdev = sblock->sdev; | ||
861 | struct btrfs_header *h; | 1269 | struct btrfs_header *h; |
862 | struct btrfs_root *root = sdev->dev->dev_root; | 1270 | struct btrfs_root *root = sdev->dev->dev_root; |
863 | struct btrfs_fs_info *fs_info = root->fs_info; | 1271 | struct btrfs_fs_info *fs_info = root->fs_info; |
864 | u8 csum[BTRFS_CSUM_SIZE]; | 1272 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1273 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | ||
1274 | struct page *page; | ||
1275 | void *mapped_buffer; | ||
1276 | u64 mapped_size; | ||
1277 | void *p; | ||
865 | u32 crc = ~(u32)0; | 1278 | u32 crc = ~(u32)0; |
866 | int fail = 0; | 1279 | int fail = 0; |
867 | int crc_fail = 0; | 1280 | int crc_fail = 0; |
1281 | u64 len; | ||
1282 | int index; | ||
1283 | |||
1284 | BUG_ON(sblock->page_count < 1); | ||
1285 | page = sblock->pagev[0].page; | ||
1286 | mapped_buffer = kmap_atomic(page, KM_USER0); | ||
1287 | h = (struct btrfs_header *)mapped_buffer; | ||
1288 | memcpy(on_disk_csum, h->csum, sdev->csum_size); | ||
868 | 1289 | ||
869 | /* | 1290 | /* |
870 | * we don't use the getter functions here, as we | 1291 | * we don't use the getter functions here, as we |
871 | * a) don't have an extent buffer and | 1292 | * a) don't have an extent buffer and |
872 | * b) the page is already kmapped | 1293 | * b) the page is already kmapped |
873 | */ | 1294 | */ |
874 | h = (struct btrfs_header *)buffer; | ||
875 | 1295 | ||
876 | if (logical != le64_to_cpu(h->bytenr)) | 1296 | if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) |
877 | ++fail; | 1297 | ++fail; |
878 | 1298 | ||
879 | if (spag->generation != le64_to_cpu(h->generation)) | 1299 | if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) |
880 | ++fail; | 1300 | ++fail; |
881 | 1301 | ||
882 | if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) | 1302 | if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) |
@@ -886,51 +1306,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev,
886 | BTRFS_UUID_SIZE)) | 1306 | BTRFS_UUID_SIZE)) |
887 | ++fail; | 1307 | ++fail; |
888 | 1308 | ||
889 | crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, | 1309 | BUG_ON(sdev->nodesize != sdev->leafsize); |
890 | PAGE_SIZE - BTRFS_CSUM_SIZE); | 1310 | len = sdev->nodesize - BTRFS_CSUM_SIZE; |
891 | btrfs_csum_final(crc, csum); | 1311 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
892 | if (memcmp(csum, h->csum, sdev->csum_size)) | 1312 | p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; |
1313 | index = 0; | ||
1314 | for (;;) { | ||
1315 | u64 l = min_t(u64, len, mapped_size); | ||
1316 | |||
1317 | crc = btrfs_csum_data(root, p, crc, l); | ||
1318 | kunmap_atomic(mapped_buffer, KM_USER0); | ||
1319 | len -= l; | ||
1320 | if (len == 0) | ||
1321 | break; | ||
1322 | index++; | ||
1323 | BUG_ON(index >= sblock->page_count); | ||
1324 | BUG_ON(!sblock->pagev[index].page); | ||
1325 | page = sblock->pagev[index].page; | ||
1326 | mapped_buffer = kmap_atomic(page, KM_USER0); | ||
1327 | mapped_size = PAGE_SIZE; | ||
1328 | p = mapped_buffer; | ||
1329 | } | ||
1330 | |||
1331 | btrfs_csum_final(crc, calculated_csum); | ||
1332 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) | ||
893 | ++crc_fail; | 1333 | ++crc_fail; |
894 | 1334 | ||
895 | spin_lock(&sdev->stat_lock); | 1335 | if (crc_fail || fail) { |
896 | ++sdev->stat.tree_extents_scrubbed; | 1336 | spin_lock(&sdev->stat_lock); |
897 | sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; | 1337 | if (crc_fail) |
898 | if (crc_fail) | 1338 | ++sdev->stat.csum_errors; |
899 | ++sdev->stat.csum_errors; | 1339 | if (fail) |
900 | if (fail) | 1340 | ++sdev->stat.verify_errors; |
901 | ++sdev->stat.verify_errors; | 1341 | spin_unlock(&sdev->stat_lock); |
902 | spin_unlock(&sdev->stat_lock); | 1342 | } |
903 | 1343 | ||
904 | return fail || crc_fail; | 1344 | return fail || crc_fail; |
905 | } | 1345 | } |
906 | 1346 | ||
907 | static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) | 1347 | static int scrub_checksum_super(struct scrub_block *sblock) |
908 | { | 1348 | { |
909 | struct btrfs_super_block *s; | 1349 | struct btrfs_super_block *s; |
910 | u64 logical; | 1350 | struct scrub_dev *sdev = sblock->sdev; |
911 | struct scrub_dev *sdev = sbio->sdev; | ||
912 | struct btrfs_root *root = sdev->dev->dev_root; | 1351 | struct btrfs_root *root = sdev->dev->dev_root; |
913 | struct btrfs_fs_info *fs_info = root->fs_info; | 1352 | struct btrfs_fs_info *fs_info = root->fs_info; |
914 | u8 csum[BTRFS_CSUM_SIZE]; | 1353 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1354 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | ||
1355 | struct page *page; | ||
1356 | void *mapped_buffer; | ||
1357 | u64 mapped_size; | ||
1358 | void *p; | ||
915 | u32 crc = ~(u32)0; | 1359 | u32 crc = ~(u32)0; |
916 | int fail = 0; | 1360 | int fail = 0; |
1361 | u64 len; | ||
1362 | int index; | ||
917 | 1363 | ||
918 | s = (struct btrfs_super_block *)buffer; | 1364 | BUG_ON(sblock->page_count < 1); |
919 | logical = sbio->logical; | 1365 | page = sblock->pagev[0].page; |
1366 | mapped_buffer = kmap_atomic(page, KM_USER0); | ||
1367 | s = (struct btrfs_super_block *)mapped_buffer; | ||
1368 | memcpy(on_disk_csum, s->csum, sdev->csum_size); | ||
920 | 1369 | ||
921 | if (logical != le64_to_cpu(s->bytenr)) | 1370 | if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) |
922 | ++fail; | 1371 | ++fail; |
923 | 1372 | ||
924 | if (sbio->spag[0].generation != le64_to_cpu(s->generation)) | 1373 | if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) |
925 | ++fail; | 1374 | ++fail; |
926 | 1375 | ||
927 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) | 1376 | if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) |
928 | ++fail; | 1377 | ++fail; |
929 | 1378 | ||
930 | crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, | 1379 | len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; |
931 | PAGE_SIZE - BTRFS_CSUM_SIZE); | 1380 | mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
932 | btrfs_csum_final(crc, csum); | 1381 | p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; |
933 | if (memcmp(csum, s->csum, sbio->sdev->csum_size)) | 1382 | index = 0; |
1383 | for (;;) { | ||
1384 | u64 l = min_t(u64, len, mapped_size); | ||
1385 | |||
1386 | crc = btrfs_csum_data(root, p, crc, l); | ||
1387 | kunmap_atomic(mapped_buffer, KM_USER0); | ||
1388 | len -= l; | ||
1389 | if (len == 0) | ||
1390 | break; | ||
1391 | index++; | ||
1392 | BUG_ON(index >= sblock->page_count); | ||
1393 | BUG_ON(!sblock->pagev[index].page); | ||
1394 | page = sblock->pagev[index].page; | ||
1395 | mapped_buffer = kmap_atomic(page, KM_USER0); | ||
1396 | mapped_size = PAGE_SIZE; | ||
1397 | p = mapped_buffer; | ||
1398 | } | ||
1399 | |||
1400 | btrfs_csum_final(crc, calculated_csum); | ||
1401 | if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) | ||
934 | ++fail; | 1402 | ++fail; |
935 | 1403 | ||
936 | if (fail) { | 1404 | if (fail) { |
@@ -947,6 +1415,23 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
947 | return fail; | 1415 | return fail; |
948 | } | 1416 | } |
949 | 1417 | ||
1418 | static void scrub_block_get(struct scrub_block *sblock) | ||
1419 | { | ||
1420 | atomic_inc(&sblock->ref_count); | ||
1421 | } | ||
1422 | |||
1423 | static void scrub_block_put(struct scrub_block *sblock) | ||
1424 | { | ||
1425 | if (atomic_dec_and_test(&sblock->ref_count)) { | ||
1426 | int i; | ||
1427 | |||
1428 | for (i = 0; i < sblock->page_count; i++) | ||
1429 | if (sblock->pagev[i].page) | ||
1430 | __free_page(sblock->pagev[i].page); | ||
1431 | kfree(sblock); | ||
1432 | } | ||
1433 | } | ||
1434 | |||
950 | static void scrub_submit(struct scrub_dev *sdev) | 1435 | static void scrub_submit(struct scrub_dev *sdev) |
951 | { | 1436 | { |
952 | struct scrub_bio *sbio; | 1437 | struct scrub_bio *sbio; |
@@ -955,19 +1440,17 @@ static void scrub_submit(struct scrub_dev *sdev)
955 | return; | 1440 | return; |
956 | 1441 | ||
957 | sbio = sdev->bios[sdev->curr]; | 1442 | sbio = sdev->bios[sdev->curr]; |
958 | sbio->err = 0; | ||
959 | sdev->curr = -1; | 1443 | sdev->curr = -1; |
960 | atomic_inc(&sdev->in_flight); | 1444 | atomic_inc(&sdev->in_flight); |
961 | 1445 | ||
962 | btrfsic_submit_bio(READ, sbio->bio); | 1446 | btrfsic_submit_bio(READ, sbio->bio); |
963 | } | 1447 | } |
964 | 1448 | ||
965 | static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, | 1449 | static int scrub_add_page_to_bio(struct scrub_dev *sdev, |
966 | u64 physical, u64 flags, u64 gen, int mirror_num, | 1450 | struct scrub_page *spage) |
967 | u8 *csum, int force) | ||
968 | { | 1451 | { |
1452 | struct scrub_block *sblock = spage->sblock; | ||
969 | struct scrub_bio *sbio; | 1453 | struct scrub_bio *sbio; |
970 | struct page *page; | ||
971 | int ret; | 1454 | int ret; |
972 | 1455 | ||
973 | again: | 1456 | again: |
@@ -980,7 +1463,7 @@ again: | |||
980 | if (sdev->curr != -1) { | 1463 | if (sdev->curr != -1) { |
981 | sdev->first_free = sdev->bios[sdev->curr]->next_free; | 1464 | sdev->first_free = sdev->bios[sdev->curr]->next_free; |
982 | sdev->bios[sdev->curr]->next_free = -1; | 1465 | sdev->bios[sdev->curr]->next_free = -1; |
983 | sdev->bios[sdev->curr]->count = 0; | 1466 | sdev->bios[sdev->curr]->page_count = 0; |
984 | spin_unlock(&sdev->list_lock); | 1467 | spin_unlock(&sdev->list_lock); |
985 | } else { | 1468 | } else { |
986 | spin_unlock(&sdev->list_lock); | 1469 | spin_unlock(&sdev->list_lock); |
@@ -988,53 +1471,200 @@ again: | |||
988 | } | 1471 | } |
989 | } | 1472 | } |
990 | sbio = sdev->bios[sdev->curr]; | 1473 | sbio = sdev->bios[sdev->curr]; |
991 | if (sbio->count == 0) { | 1474 | if (sbio->page_count == 0) { |
992 | struct bio *bio; | 1475 | struct bio *bio; |
993 | 1476 | ||
994 | sbio->physical = physical; | 1477 | sbio->physical = spage->physical; |
995 | sbio->logical = logical; | 1478 | sbio->logical = spage->logical; |
996 | bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); | 1479 | bio = sbio->bio; |
997 | if (!bio) | 1480 | if (!bio) { |
998 | return -ENOMEM; | 1481 | bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); |
1482 | if (!bio) | ||
1483 | return -ENOMEM; | ||
1484 | sbio->bio = bio; | ||
1485 | } | ||
999 | 1486 | ||
1000 | bio->bi_private = sbio; | 1487 | bio->bi_private = sbio; |
1001 | bio->bi_end_io = scrub_bio_end_io; | 1488 | bio->bi_end_io = scrub_bio_end_io; |
1002 | bio->bi_bdev = sdev->dev->bdev; | 1489 | bio->bi_bdev = sdev->dev->bdev; |
1003 | bio->bi_sector = sbio->physical >> 9; | 1490 | bio->bi_sector = spage->physical >> 9; |
1004 | sbio->err = 0; | 1491 | sbio->err = 0; |
1005 | sbio->bio = bio; | 1492 | } else if (sbio->physical + sbio->page_count * PAGE_SIZE != |
1006 | } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || | 1493 | spage->physical || |
1007 | sbio->logical + sbio->count * PAGE_SIZE != logical) { | 1494 | sbio->logical + sbio->page_count * PAGE_SIZE != |
1495 | spage->logical) { | ||
1008 | scrub_submit(sdev); | 1496 | scrub_submit(sdev); |
1009 | goto again; | 1497 | goto again; |
1010 | } | 1498 | } |
1011 | sbio->spag[sbio->count].flags = flags; | ||
1012 | sbio->spag[sbio->count].generation = gen; | ||
1013 | sbio->spag[sbio->count].have_csum = 0; | ||
1014 | sbio->spag[sbio->count].mirror_num = mirror_num; | ||
1015 | |||
1016 | page = alloc_page(GFP_NOFS); | ||
1017 | if (!page) | ||
1018 | return -ENOMEM; | ||
1019 | 1499 | ||
1020 | ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); | 1500 | sbio->pagev[sbio->page_count] = spage; |
1021 | if (!ret) { | 1501 | ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); |
1022 | __free_page(page); | 1502 | if (ret != PAGE_SIZE) { |
1503 | if (sbio->page_count < 1) { | ||
1504 | bio_put(sbio->bio); | ||
1505 | sbio->bio = NULL; | ||
1506 | return -EIO; | ||
1507 | } | ||
1023 | scrub_submit(sdev); | 1508 | scrub_submit(sdev); |
1024 | goto again; | 1509 | goto again; |
1025 | } | 1510 | } |
1026 | 1511 | ||
1027 | if (csum) { | 1512 | scrub_block_get(sblock); /* one for the added page */ |
1028 | sbio->spag[sbio->count].have_csum = 1; | 1513 | atomic_inc(&sblock->outstanding_pages); |
1029 | memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); | 1514 | sbio->page_count++; |
1515 | if (sbio->page_count == sdev->pages_per_bio) | ||
1516 | scrub_submit(sdev); | ||
1517 | |||
1518 | return 0; | ||
1519 | } | ||
1520 | |||
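scrub_add_page_to_bio() only appends a page to the bio currently being filled when the page continues the run, i.e. its physical and logical addresses both equal the bio's start address plus page_count * PAGE_SIZE; anything else (or a full bio) forces a submit and a retry with a fresh bio. A small userspace model of just that coalescing decision (the batch structure and names are invented for illustration):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define MODEL_PAGE_SIZE 4096ull
#define MODEL_PAGES_PER_BIO 16

struct model_bio {
        uint64_t physical;      /* address of the first queued page */
        uint64_t logical;
        int page_count;
};

/* true if the page at (physical, logical) extends the current run */
static bool bio_accepts(const struct model_bio *bio, uint64_t physical,
                        uint64_t logical)
{
        if (bio->page_count == 0)
                return true;
        if (bio->page_count == MODEL_PAGES_PER_BIO)
                return false;
        return bio->physical + bio->page_count * MODEL_PAGE_SIZE == physical &&
               bio->logical + bio->page_count * MODEL_PAGE_SIZE == logical;
}

static void bio_add(struct model_bio *bio, uint64_t physical, uint64_t logical)
{
        if (bio->page_count == 0) {
                bio->physical = physical;
                bio->logical = logical;
        }
        bio->page_count++;
}

int main(void)
{
        struct model_bio bio = { 0 };
        uint64_t phys = 1 << 20, logi = 1 << 30;

        for (int i = 0; i < 4; i++) {   /* four contiguous pages coalesce */
                if (!bio_accepts(&bio, phys, logi))
                        printf("submit and retry\n");
                bio_add(&bio, phys, logi);
                phys += MODEL_PAGE_SIZE;
                logi += MODEL_PAGE_SIZE;
        }
        /* a gap forces a submit before the next page is queued */
        printf("page after a gap accepted: %d\n",
               bio_accepts(&bio, phys + MODEL_PAGE_SIZE, logi + MODEL_PAGE_SIZE));
        return 0;
}

The check keeps each bio a single sequential read, so sbio->physical and sbio->logical together with page_count fully describe where its pages came from.
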
1521 | static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, | ||
1522 | u64 physical, u64 flags, u64 gen, int mirror_num, | ||
1523 | u8 *csum, int force) | ||
1524 | { | ||
1525 | struct scrub_block *sblock; | ||
1526 | int index; | ||
1527 | |||
1528 | sblock = kzalloc(sizeof(*sblock), GFP_NOFS); | ||
1529 | if (!sblock) { | ||
1530 | spin_lock(&sdev->stat_lock); | ||
1531 | sdev->stat.malloc_errors++; | ||
1532 | spin_unlock(&sdev->stat_lock); | ||
1533 | return -ENOMEM; | ||
1534 | } | ||
1535 | |||
1536 | /* one ref inside this function, plus one for each page later on */ | ||
1537 | atomic_set(&sblock->ref_count, 1); | ||
1538 | sblock->sdev = sdev; | ||
1539 | sblock->no_io_error_seen = 1; | ||
1540 | |||
1541 | for (index = 0; len > 0; index++) { | ||
1542 | struct scrub_page *spage = sblock->pagev + index; | ||
1543 | u64 l = min_t(u64, len, PAGE_SIZE); | ||
1544 | |||
1545 | BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); | ||
1546 | spage->page = alloc_page(GFP_NOFS); | ||
1547 | if (!spage->page) { | ||
1548 | spin_lock(&sdev->stat_lock); | ||
1549 | sdev->stat.malloc_errors++; | ||
1550 | spin_unlock(&sdev->stat_lock); | ||
1551 | while (index > 0) { | ||
1552 | index--; | ||
1553 | __free_page(sblock->pagev[index].page); | ||
1554 | } | ||
1555 | kfree(sblock); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | spage->sblock = sblock; | ||
1559 | spage->bdev = sdev->dev->bdev; | ||
1560 | spage->flags = flags; | ||
1561 | spage->generation = gen; | ||
1562 | spage->logical = logical; | ||
1563 | spage->physical = physical; | ||
1564 | spage->mirror_num = mirror_num; | ||
1565 | if (csum) { | ||
1566 | spage->have_csum = 1; | ||
1567 | memcpy(spage->csum, csum, sdev->csum_size); | ||
1568 | } else { | ||
1569 | spage->have_csum = 0; | ||
1570 | } | ||
1571 | sblock->page_count++; | ||
1572 | len -= l; | ||
1573 | logical += l; | ||
1574 | physical += l; | ||
1575 | } | ||
1576 | |||
1577 | BUG_ON(sblock->page_count == 0); | ||
1578 | for (index = 0; index < sblock->page_count; index++) { | ||
1579 | struct scrub_page *spage = sblock->pagev + index; | ||
1580 | int ret; | ||
1581 | |||
1582 | ret = scrub_add_page_to_bio(sdev, spage); | ||
1583 | if (ret) { | ||
1584 | scrub_block_put(sblock); | ||
1585 | return ret; | ||
1586 | } | ||
1030 | } | 1587 | } |
1031 | ++sbio->count; | 1588 | |
1032 | if (sbio->count == SCRUB_PAGES_PER_BIO || force) | 1589 | if (force) |
1033 | scrub_submit(sdev); | 1590 | scrub_submit(sdev); |
1034 | 1591 | ||
1592 | /* last one frees, either here or in bio completion for last page */ | ||
1593 | scrub_block_put(sblock); | ||
1035 | return 0; | 1594 | return 0; |
1036 | } | 1595 | } |
1037 | 1596 | ||
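scrub_pages() turns one checksummable unit (a data sector or a whole node/leaf, up to SCRUB_MAX_PAGES_PER_BLOCK pages) into a scrub_block whose pagev[] entries each cover PAGE_SIZE of the range, with logical and physical advancing in step. A compilable sketch of just that splitting arithmetic (structure names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096ull
#define MODEL_MAX_PAGES_PER_BLOCK 16

struct model_page {
        uint64_t logical;
        uint64_t physical;
        uint64_t len;
};

/* split [logical, logical + len) into page-sized pieces; returns the count */
static int split_block(uint64_t logical, uint64_t physical, uint64_t len,
                       struct model_page pagev[MODEL_MAX_PAGES_PER_BLOCK])
{
        int index;

        for (index = 0; len > 0; index++) {
                uint64_t l = len < MODEL_PAGE_SIZE ? len : MODEL_PAGE_SIZE;

                if (index >= MODEL_MAX_PAGES_PER_BLOCK)
                        return -1;      /* block larger than supported */
                pagev[index].logical = logical;
                pagev[index].physical = physical;
                pagev[index].len = l;
                len -= l;
                logical += l;
                physical += l;
        }
        return index;
}

int main(void)
{
        struct model_page pagev[MODEL_MAX_PAGES_PER_BLOCK];
        int n = split_block(1ull << 30, 1ull << 20, 16384, pagev); /* 16K node */

        for (int i = 0; i < n; i++)
                printf("page %d: logical %llu len %llu\n", i,
                       (unsigned long long)pagev[i].logical,
                       (unsigned long long)pagev[i].len);
        return 0;
}
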
1597 | static void scrub_bio_end_io(struct bio *bio, int err) | ||
1598 | { | ||
1599 | struct scrub_bio *sbio = bio->bi_private; | ||
1600 | struct scrub_dev *sdev = sbio->sdev; | ||
1601 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | ||
1602 | |||
1603 | sbio->err = err; | ||
1604 | sbio->bio = bio; | ||
1605 | |||
1606 | btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); | ||
1607 | } | ||
1608 | |||
1609 | static void scrub_bio_end_io_worker(struct btrfs_work *work) | ||
1610 | { | ||
1611 | struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); | ||
1612 | struct scrub_dev *sdev = sbio->sdev; | ||
1613 | int i; | ||
1614 | |||
1615 | BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); | ||
1616 | if (sbio->err) { | ||
1617 | for (i = 0; i < sbio->page_count; i++) { | ||
1618 | struct scrub_page *spage = sbio->pagev[i]; | ||
1619 | |||
1620 | spage->io_error = 1; | ||
1621 | spage->sblock->no_io_error_seen = 0; | ||
1622 | } | ||
1623 | } | ||
1624 | |||
1625 | /* now complete the scrub_block items that have all pages completed */ | ||
1626 | for (i = 0; i < sbio->page_count; i++) { | ||
1627 | struct scrub_page *spage = sbio->pagev[i]; | ||
1628 | struct scrub_block *sblock = spage->sblock; | ||
1629 | |||
1630 | if (atomic_dec_and_test(&sblock->outstanding_pages)) | ||
1631 | scrub_block_complete(sblock); | ||
1632 | scrub_block_put(sblock); | ||
1633 | } | ||
1634 | |||
1635 | if (sbio->err) { | ||
1636 | /* what is this good for??? */ | ||
1637 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1638 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; | ||
1639 | sbio->bio->bi_phys_segments = 0; | ||
1640 | sbio->bio->bi_idx = 0; | ||
1641 | |||
1642 | for (i = 0; i < sbio->page_count; i++) { | ||
1643 | struct bio_vec *bi; | ||
1644 | bi = &sbio->bio->bi_io_vec[i]; | ||
1645 | bi->bv_offset = 0; | ||
1646 | bi->bv_len = PAGE_SIZE; | ||
1647 | } | ||
1648 | } | ||
1649 | |||
1650 | bio_put(sbio->bio); | ||
1651 | sbio->bio = NULL; | ||
1652 | spin_lock(&sdev->list_lock); | ||
1653 | sbio->next_free = sdev->first_free; | ||
1654 | sdev->first_free = sbio->index; | ||
1655 | spin_unlock(&sdev->list_lock); | ||
1656 | atomic_dec(&sdev->in_flight); | ||
1657 | wake_up(&sdev->list_wait); | ||
1658 | } | ||
1659 | |||
1660 | static void scrub_block_complete(struct scrub_block *sblock) | ||
1661 | { | ||
1662 | if (!sblock->no_io_error_seen) | ||
1663 | scrub_handle_errored_block(sblock); | ||
1664 | else | ||
1665 | scrub_checksum(sblock); | ||
1666 | } | ||
1667 | |||
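The completion path runs in a worker thread: an I/O error marks every page of the failed bio and clears no_io_error_seen on the owning blocks, and each finished page decrements its block's outstanding_pages; the page that takes the counter to zero calls scrub_block_complete(), which either verifies checksums or enters the error handling path. A userspace model of that last-page-completes-the-block accounting (names are illustrative, and error marking is simplified to a per-page flag here):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct model_block {
        atomic_int outstanding_pages;
        bool no_io_error_seen;
};

static void block_complete(struct model_block *b)
{
        if (!b->no_io_error_seen)
                printf("handle errored block\n");       /* re-read / repair */
        else
                printf("verify checksums\n");
}

/* called once per page as its bio finishes; err != 0 marks an I/O error */
static void page_completed(struct model_block *b, int err)
{
        if (err)
                b->no_io_error_seen = false;
        if (atomic_fetch_sub(&b->outstanding_pages, 1) == 1)
                block_complete(b);
}

int main(void)
{
        struct model_block b = { .no_io_error_seen = true };

        atomic_init(&b.outstanding_pages, 4);   /* a 16K node, four pages */
        page_completed(&b, 0);
        page_completed(&b, 0);
        page_completed(&b, 0);
        page_completed(&b, 0);  /* last page triggers verification */
        return 0;
}
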
1038 | static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | 1668 | static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, |
1039 | u8 *csum) | 1669 | u8 *csum) |
1040 | { | 1670 | { |
@@ -1042,7 +1672,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1042 | int ret = 0; | 1672 | int ret = 0; |
1043 | unsigned long i; | 1673 | unsigned long i; |
1044 | unsigned long num_sectors; | 1674 | unsigned long num_sectors; |
1045 | u32 sectorsize = sdev->dev->dev_root->sectorsize; | ||
1046 | 1675 | ||
1047 | while (!list_empty(&sdev->csum_list)) { | 1676 | while (!list_empty(&sdev->csum_list)) { |
1048 | sum = list_first_entry(&sdev->csum_list, | 1677 | sum = list_first_entry(&sdev->csum_list, |
@@ -1060,7 +1689,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1060 | if (!sum) | 1689 | if (!sum) |
1061 | return 0; | 1690 | return 0; |
1062 | 1691 | ||
1063 | num_sectors = sum->len / sectorsize; | 1692 | num_sectors = sum->len / sdev->sectorsize; |
1064 | for (i = 0; i < num_sectors; ++i) { | 1693 | for (i = 0; i < num_sectors; ++i) { |
1065 | if (sum->sums[i].bytenr == logical) { | 1694 | if (sum->sums[i].bytenr == logical) { |
1066 | memcpy(csum, &sum->sums[i].sum, sdev->csum_size); | 1695 | memcpy(csum, &sum->sums[i].sum, sdev->csum_size); |
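scrub_find_csum() discards list entries that end before the current logical address, then scans the remaining entry sector by sector (using the sectorsize now cached in the scrub_dev) for one whose bytenr matches, copying its checksum out. A simplified userspace model of that per-sector lookup (the data layout and names are illustrative, not the kernel's btrfs_ordered_sum):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MODEL_CSUM_SIZE 4
#define MODEL_SECTORSIZE 4096ull

struct model_sector_sum {
        uint64_t bytenr;                        /* logical address of the sector */
        uint8_t sum[MODEL_CSUM_SIZE];
};

struct model_ordered_sum {
        uint64_t len;                           /* bytes covered by this entry */
        struct model_sector_sum sums[8];
};

/* returns 1 and copies the checksum out if one is recorded for `logical` */
static int find_csum(const struct model_ordered_sum *sum, uint64_t logical,
                     uint8_t csum[MODEL_CSUM_SIZE])
{
        uint64_t num_sectors = sum->len / MODEL_SECTORSIZE;

        for (uint64_t i = 0; i < num_sectors; i++) {
                if (sum->sums[i].bytenr == logical) {
                        memcpy(csum, sum->sums[i].sum, MODEL_CSUM_SIZE);
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        struct model_ordered_sum sum = { .len = 4 * MODEL_SECTORSIZE };
        uint8_t csum[MODEL_CSUM_SIZE];

        for (int i = 0; i < 4; i++)
                sum.sums[i].bytenr = 8192 + i * MODEL_SECTORSIZE;
        memcpy(sum.sums[2].sum, "\x11\x22\x33\x44", MODEL_CSUM_SIZE);
        printf("found: %d\n",
               find_csum(&sum, 8192 + 2 * MODEL_SECTORSIZE, csum));
        return 0;
}
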
@@ -1081,9 +1710,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1081 | { | 1710 | { |
1082 | int ret; | 1711 | int ret; |
1083 | u8 csum[BTRFS_CSUM_SIZE]; | 1712 | u8 csum[BTRFS_CSUM_SIZE]; |
1713 | u32 blocksize; | ||
1714 | |||
1715 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | ||
1716 | blocksize = sdev->sectorsize; | ||
1717 | spin_lock(&sdev->stat_lock); | ||
1718 | sdev->stat.data_extents_scrubbed++; | ||
1719 | sdev->stat.data_bytes_scrubbed += len; | ||
1720 | spin_unlock(&sdev->stat_lock); | ||
1721 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
1722 | BUG_ON(sdev->nodesize != sdev->leafsize); | ||
1723 | blocksize = sdev->nodesize; | ||
1724 | spin_lock(&sdev->stat_lock); | ||
1725 | sdev->stat.tree_extents_scrubbed++; | ||
1726 | sdev->stat.tree_bytes_scrubbed += len; | ||
1727 | spin_unlock(&sdev->stat_lock); | ||
1728 | } else { | ||
1729 | blocksize = sdev->sectorsize; | ||
1730 | BUG_ON(1); | ||
1731 | } | ||
1084 | 1732 | ||
1085 | while (len) { | 1733 | while (len) { |
1086 | u64 l = min_t(u64, len, PAGE_SIZE); | 1734 | u64 l = min_t(u64, len, blocksize); |
1087 | int have_csum = 0; | 1735 | int have_csum = 0; |
1088 | 1736 | ||
1089 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | 1737 | if (flags & BTRFS_EXTENT_FLAG_DATA) { |
@@ -1092,8 +1740,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | |||
1092 | if (have_csum == 0) | 1740 | if (have_csum == 0) |
1093 | ++sdev->stat.no_csum; | 1741 | ++sdev->stat.no_csum; |
1094 | } | 1742 | } |
1095 | ret = scrub_page(sdev, logical, l, physical, flags, gen, | 1743 | ret = scrub_pages(sdev, logical, l, physical, flags, gen, |
1096 | mirror_num, have_csum ? csum : NULL, 0); | 1744 | mirror_num, have_csum ? csum : NULL, 0); |
1097 | if (ret) | 1745 | if (ret) |
1098 | return ret; | 1746 | return ret; |
1099 | len -= l; | 1747 | len -= l; |
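scrub_extent() now chooses the checksum granularity per extent type: data extents are cut into sectorsize pieces, each with its own checksum, while tree blocks are passed to scrub_pages() as whole nodes of nodesize bytes, since a metadata checksum covers the entire node. A compilable sketch of that per-type walk (the printf stands in for the scrub_pages() call; names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define MODEL_SECTORSIZE 4096ull
#define MODEL_NODESIZE  16384ull

enum model_extent_type { MODEL_DATA, MODEL_TREE_BLOCK };

/* walk an extent in checksummable units, as scrub_extent() does */
static void scrub_extent_model(uint64_t logical, uint64_t len,
                               enum model_extent_type type)
{
        uint64_t blocksize =
                type == MODEL_DATA ? MODEL_SECTORSIZE : MODEL_NODESIZE;

        while (len) {
                uint64_t l = len < blocksize ? len : blocksize;

                printf("scrub block at %llu, %llu bytes\n",
                       (unsigned long long)logical, (unsigned long long)l);
                len -= l;
                logical += l;
        }
}

int main(void)
{
        scrub_extent_model(0, 8192, MODEL_DATA);                /* two 4K sectors */
        scrub_extent_model(65536, 16384, MODEL_TREE_BLOCK);     /* one whole node */
        return 0;
}
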
@@ -1158,6 +1806,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
1158 | if (!path) | 1806 | if (!path) |
1159 | return -ENOMEM; | 1807 | return -ENOMEM; |
1160 | 1808 | ||
1809 | /* | ||
1810 | * work on commit root. The related disk blocks are static as | ||
1811 | * long as COW is applied. This means it is safe to rewrite | ||

1812 | * them to repair disk errors without any race conditions | ||
1813 | */ | ||
1161 | path->search_commit_root = 1; | 1814 | path->search_commit_root = 1; |
1162 | path->skip_locking = 1; | 1815 | path->skip_locking = 1; |
1163 | 1816 | ||
@@ -1511,8 +2164,8 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) | |||
1511 | if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) | 2164 | if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) |
1512 | break; | 2165 | break; |
1513 | 2166 | ||
1514 | ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, | 2167 | ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, |
1515 | BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); | 2168 | BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); |
1516 | if (ret) | 2169 | if (ret) |
1517 | return ret; | 2170 | return ret; |
1518 | } | 2171 | } |
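scrub_supers() now submits each superblock copy as a full BTRFS_SUPER_INFO_SIZE block instead of a single page, and it still stops at copies that would extend past the end of the device. A minimal sketch of that bounds check (the mirror offsets below are placeholders, not a statement about the real btrfs superblock layout):

#include <stdint.h>
#include <stdio.h>

#define MODEL_SUPER_INFO_SIZE 4096ull

/* scrub every superblock copy that fits entirely on the device */
static void scrub_supers_model(const uint64_t *offsets, int ncopies,
                               uint64_t total_bytes)
{
        for (int i = 0; i < ncopies; i++) {
                uint64_t bytenr = offsets[i];

                if (bytenr + MODEL_SUPER_INFO_SIZE > total_bytes)
                        break;  /* copy lies beyond the device, stop */
                printf("scrub super copy %d at %llu\n", i,
                       (unsigned long long)bytenr);
        }
}

int main(void)
{
        /* placeholder offsets; a small device only holds the first copy */
        const uint64_t offsets[] = { 65536, 64ull << 20, 256ull << 30 };

        scrub_supers_model(offsets, 3, 1ull << 26);     /* 64 MiB device */
        return 0;
}
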
@@ -1571,10 +2224,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | |||
1571 | /* | 2224 | /* |
1572 | * check some assumptions | 2225 | * check some assumptions |
1573 | */ | 2226 | */ |
1574 | if (root->sectorsize != PAGE_SIZE || | 2227 | if (root->nodesize != root->leafsize) { |
1575 | root->sectorsize != root->leafsize || | 2228 | printk(KERN_ERR |
1576 | root->sectorsize != root->nodesize) { | 2229 | "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", |
1577 | printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); | 2230 | root->nodesize, root->leafsize); |
2231 | return -EINVAL; | ||
2232 | } | ||
2233 | |||
2234 | if (root->nodesize > BTRFS_STRIPE_LEN) { | ||
2235 | /* | ||
2236 | * in this case scrub is unable to calculate the checksum | ||
2237 | * the way scrub is implemented. Do not handle this | ||
2238 | * situation at all because it won't ever happen. | ||
2239 | */ | ||
2240 | printk(KERN_ERR | ||
2241 | "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", | ||
2242 | root->nodesize, BTRFS_STRIPE_LEN); | ||
2243 | return -EINVAL; | ||
2244 | } | ||
2245 | |||
2246 | if (root->sectorsize != PAGE_SIZE) { | ||
2247 | /* not supported for data w/o checksums */ | ||
2248 | printk(KERN_ERR | ||
2249 | "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", | ||
2250 | root->sectorsize, (unsigned long long)PAGE_SIZE); | ||
1578 | return -EINVAL; | 2251 | return -EINVAL; |
1579 | } | 2252 | } |
1580 | 2253 | ||
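btrfs_scrub_dev() replaces the old blanket sectorsize == leafsize == nodesize == PAGE_SIZE test with three separate checks, each with its own message: nodesize must equal leafsize, nodesize must not exceed BTRFS_STRIPE_LEN, and sectorsize must still equal PAGE_SIZE (the comment notes that the general case is not yet supported for data without checksums). A compilable sketch of the same validation, with the constants assumed for illustration:

#include <stdio.h>

#define MODEL_PAGE_SIZE  4096
#define MODEL_STRIPE_LEN (64 * 1024)    /* assumed value of BTRFS_STRIPE_LEN */
#define MODEL_EINVAL     22

/* mirror of the scrub-time size checks; returns 0 if scrub can proceed */
static int check_scrub_assumptions(int nodesize, int leafsize, int sectorsize)
{
        if (nodesize != leafsize) {
                fprintf(stderr, "nodesize == leafsize (%d == %d) fails\n",
                        nodesize, leafsize);
                return -MODEL_EINVAL;
        }
        if (nodesize > MODEL_STRIPE_LEN) {
                fprintf(stderr, "nodesize <= STRIPE_LEN (%d <= %d) fails\n",
                        nodesize, MODEL_STRIPE_LEN);
                return -MODEL_EINVAL;
        }
        if (sectorsize != MODEL_PAGE_SIZE) {
                fprintf(stderr, "sectorsize == PAGE_SIZE (%d == %d) fails\n",
                        sectorsize, MODEL_PAGE_SIZE);
                return -MODEL_EINVAL;
        }
        return 0;
}

int main(void)
{
        printf("16K nodes:  %d\n", check_scrub_assumptions(16384, 16384, 4096));
        printf("128K nodes: %d\n", check_scrub_assumptions(131072, 131072, 4096));
        return 0;
}
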