aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/scrub.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-03-30 15:44:29 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-03-30 15:44:29 -0400
commit9613bebb223dea3179c265dc31e1bb41ae39f321 (patch)
tree39bf883573d23775a53be3172323c0237fef5630 /fs/btrfs/scrub.c
parent40380f1c7841a5dcbf0b20f0b6da11969211ef77 (diff)
parentbc3f116fec194f1d7329b160c266fe16b9266a1e (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs fixes and features from Chris Mason: "We've merged in the error handling patches from SuSE. These are already shipping in the sles kernel, and they give btrfs the ability to abort transactions and go readonly on errors. It involves a lot of churn as they clarify BUG_ONs, and remove the ones we now properly deal with. Josef reworked the way our metadata interacts with the page cache. page->private now points to the btrfs extent_buffer object, which makes everything faster. He changed it so we write a whole extent buffer at a time instead of allowing individual pages to go down, which will be important for the raid5/6 code (for the 3.5 merge window ;) Josef also made us more aggressive about dropping pages for metadata blocks that were freed due to COW. Overall, our metadata caching is much faster now. We've integrated my patch for metadata bigger than the page size. This allows metadata blocks up to 64KB in size. In practice 16K and 32K seem to work best. For workloads with lots of metadata, this cuts down the size of the extent allocation tree dramatically and fragments much less. Scrub was updated to support the larger block sizes, which ended up being a fairly large change (thanks Stefan Behrens). We also have an assortment of fixes and updates, especially to the balancing code (Ilya Dryomov), the back ref walker (Jan Schmidt) and the defragging code (Liu Bo)." Fixed up trivial conflicts in fs/btrfs/scrub.c that were just due to removal of the second argument to k[un]map_atomic() in commit 7ac687d9e047. 
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (75 commits) Btrfs: update the checks for mixed block groups with big metadata blocks Btrfs: update to the right index of defragment Btrfs: do not bother to defrag an extent if it is a big real extent Btrfs: add a check to decide if we should defrag the range Btrfs: fix recursive defragment with autodefrag option Btrfs: fix the mismatch of page->mapping Btrfs: fix race between direct io and autodefrag Btrfs: fix deadlock during allocating chunks Btrfs: show useful info in space reservation tracepoint Btrfs: don't use crc items bigger than 4KB Btrfs: flush out and clean up any block device pages during mount btrfs: disallow unequal data/metadata blocksize for mixed block groups Btrfs: enhance superblock sanity checks Btrfs: change scrub to support big blocks Btrfs: minor cleanup in scrub Btrfs: introduce common define for max number of mirrors Btrfs: fix infinite loop in btrfs_shrink_device() Btrfs: fix memory leak in resolver code Btrfs: allow dup for data chunks in mixed mode Btrfs: validate target profiles only if we are going to use them ...
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--fs/btrfs/scrub.c1407
1 files changed, 1035 insertions, 372 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 390e7102b0ff..90acc82046c3 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -36,37 +36,30 @@
36 * Future enhancements: 36 * Future enhancements:
37 * - In case an unrepairable extent is encountered, track which files are 37 * - In case an unrepairable extent is encountered, track which files are
38 * affected and report them 38 * affected and report them
39 * - In case of a read error on files with nodatasum, map the file and read
40 * the extent to trigger a writeback of the good copy
41 * - track and record media errors, throw out bad devices 39 * - track and record media errors, throw out bad devices
42 * - add a mode to also read unallocated space 40 * - add a mode to also read unallocated space
43 */ 41 */
44 42
45struct scrub_bio; 43struct scrub_block;
46struct scrub_page;
47struct scrub_dev; 44struct scrub_dev;
48static void scrub_bio_end_io(struct bio *bio, int err);
49static void scrub_checksum(struct btrfs_work *work);
50static int scrub_checksum_data(struct scrub_dev *sdev,
51 struct scrub_page *spag, void *buffer);
52static int scrub_checksum_tree_block(struct scrub_dev *sdev,
53 struct scrub_page *spag, u64 logical,
54 void *buffer);
55static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
56static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
57static void scrub_fixup_end_io(struct bio *bio, int err);
58static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
59 struct page *page);
60static void scrub_fixup(struct scrub_bio *sbio, int ix);
61 45
62#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 46#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
63#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 47#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
48#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
64 49
65struct scrub_page { 50struct scrub_page {
51 struct scrub_block *sblock;
52 struct page *page;
53 struct block_device *bdev;
66 u64 flags; /* extent flags */ 54 u64 flags; /* extent flags */
67 u64 generation; 55 u64 generation;
68 int mirror_num; 56 u64 logical;
69 int have_csum; 57 u64 physical;
58 struct {
59 unsigned int mirror_num:8;
60 unsigned int have_csum:1;
61 unsigned int io_error:1;
62 };
70 u8 csum[BTRFS_CSUM_SIZE]; 63 u8 csum[BTRFS_CSUM_SIZE];
71}; 64};
72 65
@@ -77,12 +70,25 @@ struct scrub_bio {
77 int err; 70 int err;
78 u64 logical; 71 u64 logical;
79 u64 physical; 72 u64 physical;
80 struct scrub_page spag[SCRUB_PAGES_PER_BIO]; 73 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
81 u64 count; 74 int page_count;
82 int next_free; 75 int next_free;
83 struct btrfs_work work; 76 struct btrfs_work work;
84}; 77};
85 78
79struct scrub_block {
80 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
81 int page_count;
82 atomic_t outstanding_pages;
83 atomic_t ref_count; /* free mem on transition to zero */
84 struct scrub_dev *sdev;
85 struct {
86 unsigned int header_error:1;
87 unsigned int checksum_error:1;
88 unsigned int no_io_error_seen:1;
89 };
90};
91
86struct scrub_dev { 92struct scrub_dev {
87 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 93 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
88 struct btrfs_device *dev; 94 struct btrfs_device *dev;
@@ -96,6 +102,10 @@ struct scrub_dev {
96 struct list_head csum_list; 102 struct list_head csum_list;
97 atomic_t cancel_req; 103 atomic_t cancel_req;
98 int readonly; 104 int readonly;
105 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
106 u32 sectorsize;
107 u32 nodesize;
108 u32 leafsize;
99 /* 109 /*
100 * statistics 110 * statistics
101 */ 111 */
@@ -124,6 +134,43 @@ struct scrub_warning {
124 int scratch_bufsize; 134 int scratch_bufsize;
125}; 135};
126 136
137
138static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
139static int scrub_setup_recheck_block(struct scrub_dev *sdev,
140 struct btrfs_mapping_tree *map_tree,
141 u64 length, u64 logical,
142 struct scrub_block *sblock);
143static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
144 struct scrub_block *sblock, int is_metadata,
145 int have_csum, u8 *csum, u64 generation,
146 u16 csum_size);
147static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
148 struct scrub_block *sblock,
149 int is_metadata, int have_csum,
150 const u8 *csum, u64 generation,
151 u16 csum_size);
152static void scrub_complete_bio_end_io(struct bio *bio, int err);
153static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
154 struct scrub_block *sblock_good,
155 int force_write);
156static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
157 struct scrub_block *sblock_good,
158 int page_num, int force_write);
159static int scrub_checksum_data(struct scrub_block *sblock);
160static int scrub_checksum_tree_block(struct scrub_block *sblock);
161static int scrub_checksum_super(struct scrub_block *sblock);
162static void scrub_block_get(struct scrub_block *sblock);
163static void scrub_block_put(struct scrub_block *sblock);
164static int scrub_add_page_to_bio(struct scrub_dev *sdev,
165 struct scrub_page *spage);
166static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
167 u64 physical, u64 flags, u64 gen, int mirror_num,
168 u8 *csum, int force);
169static void scrub_bio_end_io(struct bio *bio, int err);
170static void scrub_bio_end_io_worker(struct btrfs_work *work);
171static void scrub_block_complete(struct scrub_block *sblock);
172
173
127static void scrub_free_csums(struct scrub_dev *sdev) 174static void scrub_free_csums(struct scrub_dev *sdev)
128{ 175{
129 while (!list_empty(&sdev->csum_list)) { 176 while (!list_empty(&sdev->csum_list)) {
@@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev)
135 } 182 }
136} 183}
137 184
138static void scrub_free_bio(struct bio *bio)
139{
140 int i;
141 struct page *last_page = NULL;
142
143 if (!bio)
144 return;
145
146 for (i = 0; i < bio->bi_vcnt; ++i) {
147 if (bio->bi_io_vec[i].bv_page == last_page)
148 continue;
149 last_page = bio->bi_io_vec[i].bv_page;
150 __free_page(last_page);
151 }
152 bio_put(bio);
153}
154
155static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 185static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
156{ 186{
157 int i; 187 int i;
@@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
159 if (!sdev) 189 if (!sdev)
160 return; 190 return;
161 191
192 /* this can happen when scrub is cancelled */
193 if (sdev->curr != -1) {
194 struct scrub_bio *sbio = sdev->bios[sdev->curr];
195
196 for (i = 0; i < sbio->page_count; i++) {
197 BUG_ON(!sbio->pagev[i]);
198 BUG_ON(!sbio->pagev[i]->page);
199 scrub_block_put(sbio->pagev[i]->sblock);
200 }
201 bio_put(sbio->bio);
202 }
203
162 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 204 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
163 struct scrub_bio *sbio = sdev->bios[i]; 205 struct scrub_bio *sbio = sdev->bios[i];
164 206
165 if (!sbio) 207 if (!sbio)
166 break; 208 break;
167
168 scrub_free_bio(sbio->bio);
169 kfree(sbio); 209 kfree(sbio);
170 } 210 }
171 211
@@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
179 struct scrub_dev *sdev; 219 struct scrub_dev *sdev;
180 int i; 220 int i;
181 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 221 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
222 int pages_per_bio;
182 223
224 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
225 bio_get_nr_vecs(dev->bdev));
183 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 226 sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
184 if (!sdev) 227 if (!sdev)
185 goto nomem; 228 goto nomem;
186 sdev->dev = dev; 229 sdev->dev = dev;
230 sdev->pages_per_bio = pages_per_bio;
231 sdev->curr = -1;
187 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 232 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
188 struct scrub_bio *sbio; 233 struct scrub_bio *sbio;
189 234
@@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
194 239
195 sbio->index = i; 240 sbio->index = i;
196 sbio->sdev = sdev; 241 sbio->sdev = sdev;
197 sbio->count = 0; 242 sbio->page_count = 0;
198 sbio->work.func = scrub_checksum; 243 sbio->work.func = scrub_bio_end_io_worker;
199 244
200 if (i != SCRUB_BIOS_PER_DEV-1) 245 if (i != SCRUB_BIOS_PER_DEV-1)
201 sdev->bios[i]->next_free = i + 1; 246 sdev->bios[i]->next_free = i + 1;
@@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
203 sdev->bios[i]->next_free = -1; 248 sdev->bios[i]->next_free = -1;
204 } 249 }
205 sdev->first_free = 0; 250 sdev->first_free = 0;
206 sdev->curr = -1; 251 sdev->nodesize = dev->dev_root->nodesize;
252 sdev->leafsize = dev->dev_root->leafsize;
253 sdev->sectorsize = dev->dev_root->sectorsize;
207 atomic_set(&sdev->in_flight, 0); 254 atomic_set(&sdev->in_flight, 0);
208 atomic_set(&sdev->fixup_cnt, 0); 255 atomic_set(&sdev->fixup_cnt, 0);
209 atomic_set(&sdev->cancel_req, 0); 256 atomic_set(&sdev->cancel_req, 0);
@@ -294,10 +341,9 @@ err:
294 return 0; 341 return 0;
295} 342}
296 343
297static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, 344static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
298 int ix)
299{ 345{
300 struct btrfs_device *dev = sbio->sdev->dev; 346 struct btrfs_device *dev = sblock->sdev->dev;
301 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 347 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
302 struct btrfs_path *path; 348 struct btrfs_path *path;
303 struct btrfs_key found_key; 349 struct btrfs_key found_key;
@@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
316 362
317 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 363 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 364 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
319 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; 365 BUG_ON(sblock->page_count < 1);
320 swarn.logical = sbio->logical + ix * PAGE_SIZE; 366 swarn.sector = (sblock->pagev[0].physical) >> 9;
367 swarn.logical = sblock->pagev[0].logical;
321 swarn.errstr = errstr; 368 swarn.errstr = errstr;
322 swarn.dev = dev; 369 swarn.dev = dev;
323 swarn.msg_bufsize = bufsize; 370 swarn.msg_bufsize = bufsize;
@@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
342 do { 389 do {
343 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 390 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
344 &ref_root, &ref_level); 391 &ref_root, &ref_level);
345 printk(KERN_WARNING "%s at logical %llu on dev %s, " 392 printk(KERN_WARNING
393 "btrfs: %s at logical %llu on dev %s, "
346 "sector %llu: metadata %s (level %d) in tree " 394 "sector %llu: metadata %s (level %d) in tree "
347 "%llu\n", errstr, swarn.logical, dev->name, 395 "%llu\n", errstr, swarn.logical, dev->name,
348 (unsigned long long)swarn.sector, 396 (unsigned long long)swarn.sector,
@@ -352,8 +400,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
352 } while (ret != 1); 400 } while (ret != 1);
353 } else { 401 } else {
354 swarn.path = path; 402 swarn.path = path;
355 iterate_extent_inodes(fs_info, path, found_key.objectid, 403 iterate_extent_inodes(fs_info, found_key.objectid,
356 extent_item_pos, 404 extent_item_pos, 1,
357 scrub_print_warning_inode, &swarn); 405 scrub_print_warning_inode, &swarn);
358 } 406 }
359 407
@@ -531,9 +579,9 @@ out:
531 spin_lock(&sdev->stat_lock); 579 spin_lock(&sdev->stat_lock);
532 ++sdev->stat.uncorrectable_errors; 580 ++sdev->stat.uncorrectable_errors;
533 spin_unlock(&sdev->stat_lock); 581 spin_unlock(&sdev->stat_lock);
534 printk_ratelimited(KERN_ERR "btrfs: unable to fixup " 582 printk_ratelimited(KERN_ERR
535 "(nodatasum) error at logical %llu\n", 583 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
536 fixup->logical); 584 (unsigned long long)fixup->logical, sdev->dev->name);
537 } 585 }
538 586
539 btrfs_free_path(path); 587 btrfs_free_path(path);
@@ -550,91 +598,168 @@ out:
550} 598}
551 599
552/* 600/*
553 * scrub_recheck_error gets called when either verification of the page 601 * scrub_handle_errored_block gets called when either verification of the
554 * failed or the bio failed to read, e.g. with EIO. In the latter case, 602 * pages failed or the bio failed to read, e.g. with EIO. In the latter
555 * recheck_error gets called for every page in the bio, even though only 603 * case, this function handles all pages in the bio, even though only one
556 * one may be bad 604 * may be bad.
605 * The goal of this function is to repair the errored block by using the
606 * contents of one of the mirrors.
557 */ 607 */
558static int scrub_recheck_error(struct scrub_bio *sbio, int ix) 608static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
559{ 609{
560 struct scrub_dev *sdev = sbio->sdev; 610 struct scrub_dev *sdev = sblock_to_check->sdev;
561 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; 611 struct btrfs_fs_info *fs_info;
612 u64 length;
613 u64 logical;
614 u64 generation;
615 unsigned int failed_mirror_index;
616 unsigned int is_metadata;
617 unsigned int have_csum;
618 u8 *csum;
619 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
620 struct scrub_block *sblock_bad;
621 int ret;
622 int mirror_index;
623 int page_num;
624 int success;
562 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 625 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
563 DEFAULT_RATELIMIT_BURST); 626 DEFAULT_RATELIMIT_BURST);
627
628 BUG_ON(sblock_to_check->page_count < 1);
629 fs_info = sdev->dev->dev_root->fs_info;
630 length = sblock_to_check->page_count * PAGE_SIZE;
631 logical = sblock_to_check->pagev[0].logical;
632 generation = sblock_to_check->pagev[0].generation;
633 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
634 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
635 is_metadata = !(sblock_to_check->pagev[0].flags &
636 BTRFS_EXTENT_FLAG_DATA);
637 have_csum = sblock_to_check->pagev[0].have_csum;
638 csum = sblock_to_check->pagev[0].csum;
564 639
565 if (sbio->err) { 640 /*
566 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, 641 * read all mirrors one after the other. This includes to
567 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 642 * re-read the extent or metadata block that failed (that was
568 if (scrub_fixup_check(sbio, ix) == 0) 643 * the cause that this fixup code is called) another time,
569 return 0; 644 * page by page this time in order to know which pages
570 } 645 * caused I/O errors and which ones are good (for all mirrors).
571 if (__ratelimit(&_rs)) 646 * It is the goal to handle the situation when more than one
572 scrub_print_warning("i/o error", sbio, ix); 647 * mirror contains I/O errors, but the errors do not
573 } else { 648 * overlap, i.e. the data can be repaired by selecting the
574 if (__ratelimit(&_rs)) 649 * pages from those mirrors without I/O error on the
575 scrub_print_warning("checksum error", sbio, ix); 650 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
651 * would be that mirror #1 has an I/O error on the first page,
652 * the second page is good, and mirror #2 has an I/O error on
653 * the second page, but the first page is good.
654 * Then the first page of the first mirror can be repaired by
655 * taking the first page of the second mirror, and the
656 * second page of the second mirror can be repaired by
657 * copying the contents of the 2nd page of the 1st mirror.
658 * One more note: if the pages of one mirror contain I/O
659 * errors, the checksum cannot be verified. In order to get
660 * the best data for repairing, the first attempt is to find
661 * a mirror without I/O errors and with a validated checksum.
662 * Only if this is not possible, the pages are picked from
663 * mirrors with I/O errors without considering the checksum.
664 * If the latter is the case, at the end, the checksum of the
665 * repaired area is verified in order to correctly maintain
666 * the statistics.
667 */
668
669 sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
670 sizeof(*sblocks_for_recheck),
671 GFP_NOFS);
672 if (!sblocks_for_recheck) {
673 spin_lock(&sdev->stat_lock);
674 sdev->stat.malloc_errors++;
675 sdev->stat.read_errors++;
676 sdev->stat.uncorrectable_errors++;
677 spin_unlock(&sdev->stat_lock);
678 goto out;
576 } 679 }
577 680
578 spin_lock(&sdev->stat_lock); 681 /* setup the context, map the logical blocks and alloc the pages */
579 ++sdev->stat.read_errors; 682 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
580 spin_unlock(&sdev->stat_lock); 683 logical, sblocks_for_recheck);
684 if (ret) {
685 spin_lock(&sdev->stat_lock);
686 sdev->stat.read_errors++;
687 sdev->stat.uncorrectable_errors++;
688 spin_unlock(&sdev->stat_lock);
689 goto out;
690 }
691 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
692 sblock_bad = sblocks_for_recheck + failed_mirror_index;
581 693
582 scrub_fixup(sbio, ix); 694 /* build and submit the bios for the failed mirror, check checksums */
583 return 1; 695 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
584} 696 csum, generation, sdev->csum_size);
697 if (ret) {
698 spin_lock(&sdev->stat_lock);
699 sdev->stat.read_errors++;
700 sdev->stat.uncorrectable_errors++;
701 spin_unlock(&sdev->stat_lock);
702 goto out;
703 }
585 704
586static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 705 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
587{ 706 sblock_bad->no_io_error_seen) {
588 int ret = 1; 707 /*
589 struct page *page; 708 * the error disappeared after reading page by page, or
590 void *buffer; 709 * the area was part of a huge bio and other parts of the
591 u64 flags = sbio->spag[ix].flags; 710 * bio caused I/O errors, or the block layer merged several
711 * read requests into one and the error is caused by a
712 * different bio (usually one of the two latter cases is
713 * the cause)
714 */
715 spin_lock(&sdev->stat_lock);
716 sdev->stat.unverified_errors++;
717 spin_unlock(&sdev->stat_lock);
592 718
593 page = sbio->bio->bi_io_vec[ix].bv_page; 719 goto out;
594 buffer = kmap_atomic(page);
595 if (flags & BTRFS_EXTENT_FLAG_DATA) {
596 ret = scrub_checksum_data(sbio->sdev,
597 sbio->spag + ix, buffer);
598 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
599 ret = scrub_checksum_tree_block(sbio->sdev,
600 sbio->spag + ix,
601 sbio->logical + ix * PAGE_SIZE,
602 buffer);
603 } else {
604 WARN_ON(1);
605 } 720 }
606 kunmap_atomic(buffer);
607 721
608 return ret; 722 if (!sblock_bad->no_io_error_seen) {
609} 723 spin_lock(&sdev->stat_lock);
724 sdev->stat.read_errors++;
725 spin_unlock(&sdev->stat_lock);
726 if (__ratelimit(&_rs))
727 scrub_print_warning("i/o error", sblock_to_check);
728 } else if (sblock_bad->checksum_error) {
729 spin_lock(&sdev->stat_lock);
730 sdev->stat.csum_errors++;
731 spin_unlock(&sdev->stat_lock);
732 if (__ratelimit(&_rs))
733 scrub_print_warning("checksum error", sblock_to_check);
734 } else if (sblock_bad->header_error) {
735 spin_lock(&sdev->stat_lock);
736 sdev->stat.verify_errors++;
737 spin_unlock(&sdev->stat_lock);
738 if (__ratelimit(&_rs))
739 scrub_print_warning("checksum/header error",
740 sblock_to_check);
741 }
610 742
611static void scrub_fixup_end_io(struct bio *bio, int err) 743 if (sdev->readonly)
612{ 744 goto did_not_correct_error;
613 complete((struct completion *)bio->bi_private);
614}
615 745
616static void scrub_fixup(struct scrub_bio *sbio, int ix) 746 if (!is_metadata && !have_csum) {
617{ 747 struct scrub_fixup_nodatasum *fixup_nodatasum;
618 struct scrub_dev *sdev = sbio->sdev; 748
619 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 749 /*
620 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 750 * !is_metadata and !have_csum, this means that the data
621 struct btrfs_bio *bbio = NULL; 751 * might not be COW'ed, that it might be modified
622 struct scrub_fixup_nodatasum *fixup; 752 * concurrently. The general strategy to work on the
623 u64 logical = sbio->logical + ix * PAGE_SIZE; 753 * commit root does not help in the case when COW is not
624 u64 length; 754 * used.
625 int i; 755 */
626 int ret; 756 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
627 DECLARE_COMPLETION_ONSTACK(complete); 757 if (!fixup_nodatasum)
628 758 goto did_not_correct_error;
629 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 759 fixup_nodatasum->sdev = sdev;
630 (sbio->spag[ix].have_csum == 0)) { 760 fixup_nodatasum->logical = logical;
631 fixup = kzalloc(sizeof(*fixup), GFP_NOFS); 761 fixup_nodatasum->root = fs_info->extent_root;
632 if (!fixup) 762 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
633 goto uncorrectable;
634 fixup->sdev = sdev;
635 fixup->logical = logical;
636 fixup->root = fs_info->extent_root;
637 fixup->mirror_num = sbio->spag[ix].mirror_num;
638 /* 763 /*
639 * increment scrubs_running to prevent cancel requests from 764 * increment scrubs_running to prevent cancel requests from
640 * completing as long as a fixup worker is running. we must also 765 * completing as long as a fixup worker is running. we must also
@@ -649,235 +774,528 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
649 atomic_inc(&fs_info->scrubs_paused); 774 atomic_inc(&fs_info->scrubs_paused);
650 mutex_unlock(&fs_info->scrub_lock); 775 mutex_unlock(&fs_info->scrub_lock);
651 atomic_inc(&sdev->fixup_cnt); 776 atomic_inc(&sdev->fixup_cnt);
652 fixup->work.func = scrub_fixup_nodatasum; 777 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
653 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); 778 btrfs_queue_worker(&fs_info->scrub_workers,
654 return; 779 &fixup_nodatasum->work);
780 goto out;
655 } 781 }
656 782
657 length = PAGE_SIZE; 783 /*
658 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 784 * now build and submit the bios for the other mirrors, check
659 &bbio, 0); 785 * checksums
660 if (ret || !bbio || length < PAGE_SIZE) { 786 */
661 printk(KERN_ERR 787 for (mirror_index = 0;
662 "scrub_fixup: btrfs_map_block failed us for %llu\n", 788 mirror_index < BTRFS_MAX_MIRRORS &&
663 (unsigned long long)logical); 789 sblocks_for_recheck[mirror_index].page_count > 0;
664 WARN_ON(1); 790 mirror_index++) {
665 kfree(bbio); 791 if (mirror_index == failed_mirror_index)
666 return; 792 continue;
793
794 /* build and submit the bios, check checksums */
795 ret = scrub_recheck_block(fs_info,
796 sblocks_for_recheck + mirror_index,
797 is_metadata, have_csum, csum,
798 generation, sdev->csum_size);
799 if (ret)
800 goto did_not_correct_error;
667 } 801 }
668 802
669 if (bbio->num_stripes == 1) 803 /*
670 /* there aren't any replicas */ 804 * first try to pick the mirror which is completely without I/O
671 goto uncorrectable; 805 * errors and also does not have a checksum error.
806 * If one is found, and if a checksum is present, the full block
807 * that is known to contain an error is rewritten. Afterwards
808 * the block is known to be corrected.
809 * If a mirror is found which is completely correct, and no
810 * checksum is present, only those pages are rewritten that had
811 * an I/O error in the block to be repaired, since it cannot be
812 * determined, which copy of the other pages is better (and it
813 * could happen otherwise that a correct page would be
814 * overwritten by a bad one).
815 */
816 for (mirror_index = 0;
817 mirror_index < BTRFS_MAX_MIRRORS &&
818 sblocks_for_recheck[mirror_index].page_count > 0;
819 mirror_index++) {
820 struct scrub_block *sblock_other = sblocks_for_recheck +
821 mirror_index;
822
823 if (!sblock_other->header_error &&
824 !sblock_other->checksum_error &&
825 sblock_other->no_io_error_seen) {
826 int force_write = is_metadata || have_csum;
827
828 ret = scrub_repair_block_from_good_copy(sblock_bad,
829 sblock_other,
830 force_write);
831 if (0 == ret)
832 goto corrected_error;
833 }
834 }
672 835
673 /* 836 /*
674 * first find a good copy 837 * in case of I/O errors in the area that is supposed to be
838 * repaired, continue by picking good copies of those pages.
839 * Select the good pages from mirrors to rewrite bad pages from
840 * the area to fix. Afterwards verify the checksum of the block
841 * that is supposed to be repaired. This verification step is
842 * only done for the purpose of statistic counting and for the
843 * final scrub report, whether errors remain.
844 * A perfect algorithm could make use of the checksum and try
845 * all possible combinations of pages from the different mirrors
846 * until the checksum verification succeeds. For example, when
847 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
848 * of mirror #2 is readable but the final checksum test fails,
849 * then the 2nd page of mirror #3 could be tried, whether now
850 * the final checksum succeeds. But this would be a rare
851 * exception and is therefore not implemented. At least it is
852 * avoided that the good copy is overwritten.
853 * A more useful improvement would be to pick the sectors
854 * without I/O error based on sector sizes (512 bytes on legacy
855 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
856 * mirror could be repaired by taking 512 byte of a different
857 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
858 * area are unreadable.
675 */ 859 */
676 for (i = 0; i < bbio->num_stripes; ++i) {
677 if (i + 1 == sbio->spag[ix].mirror_num)
678 continue;
679 860
680 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, 861 /* can only fix I/O errors from here on */
681 bbio->stripes[i].physical >> 9, 862 if (sblock_bad->no_io_error_seen)
682 sbio->bio->bi_io_vec[ix].bv_page)) { 863 goto did_not_correct_error;
683 /* I/O-error, this is not a good copy */ 864
865 success = 1;
866 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
867 struct scrub_page *page_bad = sblock_bad->pagev + page_num;
868
869 if (!page_bad->io_error)
684 continue; 870 continue;
871
872 for (mirror_index = 0;
873 mirror_index < BTRFS_MAX_MIRRORS &&
874 sblocks_for_recheck[mirror_index].page_count > 0;
875 mirror_index++) {
876 struct scrub_block *sblock_other = sblocks_for_recheck +
877 mirror_index;
878 struct scrub_page *page_other = sblock_other->pagev +
879 page_num;
880
881 if (!page_other->io_error) {
882 ret = scrub_repair_page_from_good_copy(
883 sblock_bad, sblock_other, page_num, 0);
884 if (0 == ret) {
885 page_bad->io_error = 0;
886 break; /* succeeded for this page */
887 }
888 }
685 } 889 }
686 890
687 if (scrub_fixup_check(sbio, ix) == 0) 891 if (page_bad->io_error) {
688 break; 892 /* did not find a mirror to copy the page from */
893 success = 0;
894 }
689 } 895 }
690 if (i == bbio->num_stripes)
691 goto uncorrectable;
692 896
693 if (!sdev->readonly) { 897 if (success) {
694 /* 898 if (is_metadata || have_csum) {
695 * bi_io_vec[ix].bv_page now contains good data, write it back 899 /*
696 */ 900 * need to verify the checksum now that all
697 if (scrub_fixup_io(WRITE, sdev->dev->bdev, 901 * sectors on disk are repaired (the write
698 (sbio->physical + ix * PAGE_SIZE) >> 9, 902 * request for data to be repaired is on its way).
699 sbio->bio->bi_io_vec[ix].bv_page)) { 903 * Just be lazy and use scrub_recheck_block()
700 /* I/O-error, writeback failed, give up */ 904 * which re-reads the data before the checksum
701 goto uncorrectable; 905 * is verified, but most likely the data comes out
906 * of the page cache.
907 */
908 ret = scrub_recheck_block(fs_info, sblock_bad,
909 is_metadata, have_csum, csum,
910 generation, sdev->csum_size);
911 if (!ret && !sblock_bad->header_error &&
912 !sblock_bad->checksum_error &&
913 sblock_bad->no_io_error_seen)
914 goto corrected_error;
915 else
916 goto did_not_correct_error;
917 } else {
918corrected_error:
919 spin_lock(&sdev->stat_lock);
920 sdev->stat.corrected_errors++;
921 spin_unlock(&sdev->stat_lock);
922 printk_ratelimited(KERN_ERR
923 "btrfs: fixed up error at logical %llu on dev %s\n",
924 (unsigned long long)logical, sdev->dev->name);
702 } 925 }
926 } else {
927did_not_correct_error:
928 spin_lock(&sdev->stat_lock);
929 sdev->stat.uncorrectable_errors++;
930 spin_unlock(&sdev->stat_lock);
931 printk_ratelimited(KERN_ERR
932 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
933 (unsigned long long)logical, sdev->dev->name);
703 } 934 }
704 935
705 kfree(bbio); 936out:
706 spin_lock(&sdev->stat_lock); 937 if (sblocks_for_recheck) {
707 ++sdev->stat.corrected_errors; 938 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
708 spin_unlock(&sdev->stat_lock); 939 mirror_index++) {
940 struct scrub_block *sblock = sblocks_for_recheck +
941 mirror_index;
942 int page_index;
943
944 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
945 page_index++)
946 if (sblock->pagev[page_index].page)
947 __free_page(
948 sblock->pagev[page_index].page);
949 }
950 kfree(sblocks_for_recheck);
951 }
709 952
710 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", 953 return 0;
711 (unsigned long long)logical); 954}
712 return;
713 955
714uncorrectable: 956static int scrub_setup_recheck_block(struct scrub_dev *sdev,
715 kfree(bbio); 957 struct btrfs_mapping_tree *map_tree,
716 spin_lock(&sdev->stat_lock); 958 u64 length, u64 logical,
717 ++sdev->stat.uncorrectable_errors; 959 struct scrub_block *sblocks_for_recheck)
718 spin_unlock(&sdev->stat_lock); 960{
961 int page_index;
962 int mirror_index;
963 int ret;
964
965 /*
966 * note: the three members sdev, ref_count and outstanding_pages
967 * are not used (and not set) in the blocks that are used for
968 * the recheck procedure
969 */
970
971 page_index = 0;
972 while (length > 0) {
973 u64 sublen = min_t(u64, length, PAGE_SIZE);
974 u64 mapped_length = sublen;
975 struct btrfs_bio *bbio = NULL;
976
977 /*
978 * with a length of PAGE_SIZE, each returned stripe
979 * represents one mirror
980 */
981 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
982 &bbio, 0);
983 if (ret || !bbio || mapped_length < sublen) {
984 kfree(bbio);
985 return -EIO;
986 }
719 987
720 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " 988 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
721 "logical %llu\n", (unsigned long long)logical); 989 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
990 mirror_index++) {
991 struct scrub_block *sblock;
992 struct scrub_page *page;
993
994 if (mirror_index >= BTRFS_MAX_MIRRORS)
995 continue;
996
997 sblock = sblocks_for_recheck + mirror_index;
998 page = sblock->pagev + page_index;
999 page->logical = logical;
1000 page->physical = bbio->stripes[mirror_index].physical;
1001 page->bdev = bbio->stripes[mirror_index].dev->bdev;
1002 page->mirror_num = mirror_index + 1;
1003 page->page = alloc_page(GFP_NOFS);
1004 if (!page->page) {
1005 spin_lock(&sdev->stat_lock);
1006 sdev->stat.malloc_errors++;
1007 spin_unlock(&sdev->stat_lock);
1008 return -ENOMEM;
1009 }
1010 sblock->page_count++;
1011 }
1012 kfree(bbio);
1013 length -= sublen;
1014 logical += sublen;
1015 page_index++;
1016 }
1017
1018 return 0;
722} 1019}
723 1020
724static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 1021/*
725 struct page *page) 1022 * this function will check the on disk data for checksum errors, header
1023 * errors and read I/O errors. If any I/O errors happen, the exact pages
1024 * which are errored are marked as being bad. The goal is to enable scrub
1025 * to take those pages that are not errored from all the mirrors so that
1026 * the pages that are errored in the just handled mirror can be repaired.
1027 */
1028static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1029 struct scrub_block *sblock, int is_metadata,
1030 int have_csum, u8 *csum, u64 generation,
1031 u16 csum_size)
726{ 1032{
727 struct bio *bio = NULL; 1033 int page_num;
728 int ret; 1034
729 DECLARE_COMPLETION_ONSTACK(complete); 1035 sblock->no_io_error_seen = 1;
1036 sblock->header_error = 0;
1037 sblock->checksum_error = 0;
1038
1039 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1040 struct bio *bio;
1041 int ret;
1042 struct scrub_page *page = sblock->pagev + page_num;
1043 DECLARE_COMPLETION_ONSTACK(complete);
1044
1045 BUG_ON(!page->page);
1046 bio = bio_alloc(GFP_NOFS, 1);
1047 bio->bi_bdev = page->bdev;
1048 bio->bi_sector = page->physical >> 9;
1049 bio->bi_end_io = scrub_complete_bio_end_io;
1050 bio->bi_private = &complete;
1051
1052 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
1053 if (PAGE_SIZE != ret) {
1054 bio_put(bio);
1055 return -EIO;
1056 }
1057 btrfsic_submit_bio(READ, bio);
730 1058
731 bio = bio_alloc(GFP_NOFS, 1); 1059 /* this will also unplug the queue */
732 bio->bi_bdev = bdev; 1060 wait_for_completion(&complete);
733 bio->bi_sector = sector;
734 bio_add_page(bio, page, PAGE_SIZE, 0);
735 bio->bi_end_io = scrub_fixup_end_io;
736 bio->bi_private = &complete;
737 btrfsic_submit_bio(rw, bio);
738 1061
739 /* this will also unplug the queue */ 1062 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
740 wait_for_completion(&complete); 1063 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1064 sblock->no_io_error_seen = 0;
1065 bio_put(bio);
1066 }
741 1067
742 ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); 1068 if (sblock->no_io_error_seen)
743 bio_put(bio); 1069 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
744 return ret; 1070 have_csum, csum, generation,
1071 csum_size);
1072
1073 return 0;
745} 1074}
746 1075
747static void scrub_bio_end_io(struct bio *bio, int err) 1076static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1077 struct scrub_block *sblock,
1078 int is_metadata, int have_csum,
1079 const u8 *csum, u64 generation,
1080 u16 csum_size)
748{ 1081{
749 struct scrub_bio *sbio = bio->bi_private; 1082 int page_num;
750 struct scrub_dev *sdev = sbio->sdev; 1083 u8 calculated_csum[BTRFS_CSUM_SIZE];
751 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1084 u32 crc = ~(u32)0;
1085 struct btrfs_root *root = fs_info->extent_root;
1086 void *mapped_buffer;
1087
1088 BUG_ON(!sblock->pagev[0].page);
1089 if (is_metadata) {
1090 struct btrfs_header *h;
1091
1092 mapped_buffer = kmap_atomic(sblock->pagev[0].page);
1093 h = (struct btrfs_header *)mapped_buffer;
1094
1095 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
1096 generation != le64_to_cpu(h->generation) ||
1097 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1098 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1099 BTRFS_UUID_SIZE))
1100 sblock->header_error = 1;
1101 csum = h->csum;
1102 } else {
1103 if (!have_csum)
1104 return;
752 1105
753 sbio->err = err; 1106 mapped_buffer = kmap_atomic(sblock->pagev[0].page);
754 sbio->bio = bio; 1107 }
755 1108
756 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 1109 for (page_num = 0;;) {
1110 if (page_num == 0 && is_metadata)
1111 crc = btrfs_csum_data(root,
1112 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1113 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1114 else
1115 crc = btrfs_csum_data(root, mapped_buffer, crc,
1116 PAGE_SIZE);
1117
1118 kunmap_atomic(mapped_buffer);
1119 page_num++;
1120 if (page_num >= sblock->page_count)
1121 break;
1122 BUG_ON(!sblock->pagev[page_num].page);
1123
1124 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
1125 }
1126
1127 btrfs_csum_final(crc, calculated_csum);
1128 if (memcmp(calculated_csum, csum, csum_size))
1129 sblock->checksum_error = 1;
757} 1130}
758 1131
759static void scrub_checksum(struct btrfs_work *work) 1132static void scrub_complete_bio_end_io(struct bio *bio, int err)
760{ 1133{
761 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1134 complete((struct completion *)bio->bi_private);
762 struct scrub_dev *sdev = sbio->sdev; 1135}
763 struct page *page;
764 void *buffer;
765 int i;
766 u64 flags;
767 u64 logical;
768 int ret;
769 1136
770 if (sbio->err) { 1137static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
771 ret = 0; 1138 struct scrub_block *sblock_good,
772 for (i = 0; i < sbio->count; ++i) 1139 int force_write)
773 ret |= scrub_recheck_error(sbio, i); 1140{
774 if (!ret) { 1141 int page_num;
775 spin_lock(&sdev->stat_lock); 1142 int ret = 0;
776 ++sdev->stat.unverified_errors;
777 spin_unlock(&sdev->stat_lock);
778 }
779 1143
780 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 1144 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
781 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 1145 int ret_sub;
782 sbio->bio->bi_phys_segments = 0;
783 sbio->bio->bi_idx = 0;
784 1146
785 for (i = 0; i < sbio->count; i++) { 1147 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
786 struct bio_vec *bi; 1148 sblock_good,
787 bi = &sbio->bio->bi_io_vec[i]; 1149 page_num,
788 bi->bv_offset = 0; 1150 force_write);
789 bi->bv_len = PAGE_SIZE; 1151 if (ret_sub)
790 } 1152 ret = ret_sub;
791 goto out;
792 } 1153 }
793 for (i = 0; i < sbio->count; ++i) { 1154
794 page = sbio->bio->bi_io_vec[i].bv_page; 1155 return ret;
795 buffer = kmap_atomic(page); 1156}
796 flags = sbio->spag[i].flags; 1157
797 logical = sbio->logical + i * PAGE_SIZE; 1158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
798 ret = 0; 1159 struct scrub_block *sblock_good,
799 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1160 int page_num, int force_write)
800 ret = scrub_checksum_data(sdev, sbio->spag + i, buffer); 1161{
801 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1162 struct scrub_page *page_bad = sblock_bad->pagev + page_num;
802 ret = scrub_checksum_tree_block(sdev, sbio->spag + i, 1163 struct scrub_page *page_good = sblock_good->pagev + page_num;
803 logical, buffer); 1164
804 } else if (flags & BTRFS_EXTENT_FLAG_SUPER) { 1165 BUG_ON(sblock_bad->pagev[page_num].page == NULL);
805 BUG_ON(i); 1166 BUG_ON(sblock_good->pagev[page_num].page == NULL);
806 (void)scrub_checksum_super(sbio, buffer); 1167 if (force_write || sblock_bad->header_error ||
807 } else { 1168 sblock_bad->checksum_error || page_bad->io_error) {
808 WARN_ON(1); 1169 struct bio *bio;
809 } 1170 int ret;
810 kunmap_atomic(buffer); 1171 DECLARE_COMPLETION_ONSTACK(complete);
811 if (ret) { 1172
812 ret = scrub_recheck_error(sbio, i); 1173 bio = bio_alloc(GFP_NOFS, 1);
813 if (!ret) { 1174 bio->bi_bdev = page_bad->bdev;
814 spin_lock(&sdev->stat_lock); 1175 bio->bi_sector = page_bad->physical >> 9;
815 ++sdev->stat.unverified_errors; 1176 bio->bi_end_io = scrub_complete_bio_end_io;
816 spin_unlock(&sdev->stat_lock); 1177 bio->bi_private = &complete;
817 } 1178
1179 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1180 if (PAGE_SIZE != ret) {
1181 bio_put(bio);
1182 return -EIO;
818 } 1183 }
1184 btrfsic_submit_bio(WRITE, bio);
1185
1186 /* this will also unplug the queue */
1187 wait_for_completion(&complete);
1188 bio_put(bio);
819 } 1189 }
820 1190
821out: 1191 return 0;
822 scrub_free_bio(sbio->bio); 1192}
823 sbio->bio = NULL; 1193
824 spin_lock(&sdev->list_lock); 1194static void scrub_checksum(struct scrub_block *sblock)
825 sbio->next_free = sdev->first_free; 1195{
826 sdev->first_free = sbio->index; 1196 u64 flags;
827 spin_unlock(&sdev->list_lock); 1197 int ret;
828 atomic_dec(&sdev->in_flight); 1198
829 wake_up(&sdev->list_wait); 1199 BUG_ON(sblock->page_count < 1);
1200 flags = sblock->pagev[0].flags;
1201 ret = 0;
1202 if (flags & BTRFS_EXTENT_FLAG_DATA)
1203 ret = scrub_checksum_data(sblock);
1204 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1205 ret = scrub_checksum_tree_block(sblock);
1206 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1207 (void)scrub_checksum_super(sblock);
1208 else
1209 WARN_ON(1);
1210 if (ret)
1211 scrub_handle_errored_block(sblock);
830} 1212}
831 1213
832static int scrub_checksum_data(struct scrub_dev *sdev, 1214static int scrub_checksum_data(struct scrub_block *sblock)
833 struct scrub_page *spag, void *buffer)
834{ 1215{
1216 struct scrub_dev *sdev = sblock->sdev;
835 u8 csum[BTRFS_CSUM_SIZE]; 1217 u8 csum[BTRFS_CSUM_SIZE];
1218 u8 *on_disk_csum;
1219 struct page *page;
1220 void *buffer;
836 u32 crc = ~(u32)0; 1221 u32 crc = ~(u32)0;
837 int fail = 0; 1222 int fail = 0;
838 struct btrfs_root *root = sdev->dev->dev_root; 1223 struct btrfs_root *root = sdev->dev->dev_root;
1224 u64 len;
1225 int index;
839 1226
840 if (!spag->have_csum) 1227 BUG_ON(sblock->page_count < 1);
1228 if (!sblock->pagev[0].have_csum)
841 return 0; 1229 return 0;
842 1230
843 crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE); 1231 on_disk_csum = sblock->pagev[0].csum;
1232 page = sblock->pagev[0].page;
1233 buffer = kmap_atomic(page);
1234
1235 len = sdev->sectorsize;
1236 index = 0;
1237 for (;;) {
1238 u64 l = min_t(u64, len, PAGE_SIZE);
1239
1240 crc = btrfs_csum_data(root, buffer, crc, l);
1241 kunmap_atomic(buffer);
1242 len -= l;
1243 if (len == 0)
1244 break;
1245 index++;
1246 BUG_ON(index >= sblock->page_count);
1247 BUG_ON(!sblock->pagev[index].page);
1248 page = sblock->pagev[index].page;
1249 buffer = kmap_atomic(page);
1250 }
1251
844 btrfs_csum_final(crc, csum); 1252 btrfs_csum_final(crc, csum);
845 if (memcmp(csum, spag->csum, sdev->csum_size)) 1253 if (memcmp(csum, on_disk_csum, sdev->csum_size))
846 fail = 1; 1254 fail = 1;
847 1255
848 spin_lock(&sdev->stat_lock); 1256 if (fail) {
849 ++sdev->stat.data_extents_scrubbed; 1257 spin_lock(&sdev->stat_lock);
850 sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
851 if (fail)
852 ++sdev->stat.csum_errors; 1258 ++sdev->stat.csum_errors;
853 spin_unlock(&sdev->stat_lock); 1259 spin_unlock(&sdev->stat_lock);
1260 }
854 1261
855 return fail; 1262 return fail;
856} 1263}
857 1264
858static int scrub_checksum_tree_block(struct scrub_dev *sdev, 1265static int scrub_checksum_tree_block(struct scrub_block *sblock)
859 struct scrub_page *spag, u64 logical,
860 void *buffer)
861{ 1266{
1267 struct scrub_dev *sdev = sblock->sdev;
862 struct btrfs_header *h; 1268 struct btrfs_header *h;
863 struct btrfs_root *root = sdev->dev->dev_root; 1269 struct btrfs_root *root = sdev->dev->dev_root;
864 struct btrfs_fs_info *fs_info = root->fs_info; 1270 struct btrfs_fs_info *fs_info = root->fs_info;
865 u8 csum[BTRFS_CSUM_SIZE]; 1271 u8 calculated_csum[BTRFS_CSUM_SIZE];
1272 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1273 struct page *page;
1274 void *mapped_buffer;
1275 u64 mapped_size;
1276 void *p;
866 u32 crc = ~(u32)0; 1277 u32 crc = ~(u32)0;
867 int fail = 0; 1278 int fail = 0;
868 int crc_fail = 0; 1279 int crc_fail = 0;
1280 u64 len;
1281 int index;
1282
1283 BUG_ON(sblock->page_count < 1);
1284 page = sblock->pagev[0].page;
1285 mapped_buffer = kmap_atomic(page);
1286 h = (struct btrfs_header *)mapped_buffer;
1287 memcpy(on_disk_csum, h->csum, sdev->csum_size);
869 1288
870 /* 1289 /*
871 * we don't use the getter functions here, as we 1290 * we don't use the getter functions here, as we
872 * a) don't have an extent buffer and 1291 * a) don't have an extent buffer and
873 * b) the page is already kmapped 1292 * b) the page is already kmapped
874 */ 1293 */
875 h = (struct btrfs_header *)buffer;
876 1294
877 if (logical != le64_to_cpu(h->bytenr)) 1295 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
878 ++fail; 1296 ++fail;
879 1297
880 if (spag->generation != le64_to_cpu(h->generation)) 1298 if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
881 ++fail; 1299 ++fail;
882 1300
883 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1301 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -887,51 +1305,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev,
887 BTRFS_UUID_SIZE)) 1305 BTRFS_UUID_SIZE))
888 ++fail; 1306 ++fail;
889 1307
890 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, 1308 BUG_ON(sdev->nodesize != sdev->leafsize);
891 PAGE_SIZE - BTRFS_CSUM_SIZE); 1309 len = sdev->nodesize - BTRFS_CSUM_SIZE;
892 btrfs_csum_final(crc, csum); 1310 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
893 if (memcmp(csum, h->csum, sdev->csum_size)) 1311 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1312 index = 0;
1313 for (;;) {
1314 u64 l = min_t(u64, len, mapped_size);
1315
1316 crc = btrfs_csum_data(root, p, crc, l);
1317 kunmap_atomic(mapped_buffer);
1318 len -= l;
1319 if (len == 0)
1320 break;
1321 index++;
1322 BUG_ON(index >= sblock->page_count);
1323 BUG_ON(!sblock->pagev[index].page);
1324 page = sblock->pagev[index].page;
1325 mapped_buffer = kmap_atomic(page);
1326 mapped_size = PAGE_SIZE;
1327 p = mapped_buffer;
1328 }
1329
1330 btrfs_csum_final(crc, calculated_csum);
1331 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
894 ++crc_fail; 1332 ++crc_fail;
895 1333
896 spin_lock(&sdev->stat_lock); 1334 if (crc_fail || fail) {
897 ++sdev->stat.tree_extents_scrubbed; 1335 spin_lock(&sdev->stat_lock);
898 sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; 1336 if (crc_fail)
899 if (crc_fail) 1337 ++sdev->stat.csum_errors;
900 ++sdev->stat.csum_errors; 1338 if (fail)
901 if (fail) 1339 ++sdev->stat.verify_errors;
902 ++sdev->stat.verify_errors; 1340 spin_unlock(&sdev->stat_lock);
903 spin_unlock(&sdev->stat_lock); 1341 }
904 1342
905 return fail || crc_fail; 1343 return fail || crc_fail;
906} 1344}
907 1345
908static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) 1346static int scrub_checksum_super(struct scrub_block *sblock)
909{ 1347{
910 struct btrfs_super_block *s; 1348 struct btrfs_super_block *s;
911 u64 logical; 1349 struct scrub_dev *sdev = sblock->sdev;
912 struct scrub_dev *sdev = sbio->sdev;
913 struct btrfs_root *root = sdev->dev->dev_root; 1350 struct btrfs_root *root = sdev->dev->dev_root;
914 struct btrfs_fs_info *fs_info = root->fs_info; 1351 struct btrfs_fs_info *fs_info = root->fs_info;
915 u8 csum[BTRFS_CSUM_SIZE]; 1352 u8 calculated_csum[BTRFS_CSUM_SIZE];
1353 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1354 struct page *page;
1355 void *mapped_buffer;
1356 u64 mapped_size;
1357 void *p;
916 u32 crc = ~(u32)0; 1358 u32 crc = ~(u32)0;
917 int fail = 0; 1359 int fail = 0;
1360 u64 len;
1361 int index;
918 1362
919 s = (struct btrfs_super_block *)buffer; 1363 BUG_ON(sblock->page_count < 1);
920 logical = sbio->logical; 1364 page = sblock->pagev[0].page;
1365 mapped_buffer = kmap_atomic(page);
1366 s = (struct btrfs_super_block *)mapped_buffer;
1367 memcpy(on_disk_csum, s->csum, sdev->csum_size);
921 1368
922 if (logical != le64_to_cpu(s->bytenr)) 1369 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
923 ++fail; 1370 ++fail;
924 1371
925 if (sbio->spag[0].generation != le64_to_cpu(s->generation)) 1372 if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
926 ++fail; 1373 ++fail;
927 1374
928 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1375 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
929 ++fail; 1376 ++fail;
930 1377
931 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, 1378 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
932 PAGE_SIZE - BTRFS_CSUM_SIZE); 1379 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
933 btrfs_csum_final(crc, csum); 1380 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
934 if (memcmp(csum, s->csum, sbio->sdev->csum_size)) 1381 index = 0;
1382 for (;;) {
1383 u64 l = min_t(u64, len, mapped_size);
1384
1385 crc = btrfs_csum_data(root, p, crc, l);
1386 kunmap_atomic(mapped_buffer);
1387 len -= l;
1388 if (len == 0)
1389 break;
1390 index++;
1391 BUG_ON(index >= sblock->page_count);
1392 BUG_ON(!sblock->pagev[index].page);
1393 page = sblock->pagev[index].page;
1394 mapped_buffer = kmap_atomic(page);
1395 mapped_size = PAGE_SIZE;
1396 p = mapped_buffer;
1397 }
1398
1399 btrfs_csum_final(crc, calculated_csum);
1400 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
935 ++fail; 1401 ++fail;
936 1402
937 if (fail) { 1403 if (fail) {
@@ -948,29 +1414,42 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
948 return fail; 1414 return fail;
949} 1415}
950 1416
951static int scrub_submit(struct scrub_dev *sdev) 1417static void scrub_block_get(struct scrub_block *sblock)
1418{
1419 atomic_inc(&sblock->ref_count);
1420}
1421
/*
 * Drop one reference on @sblock. When the reference count reaches zero,
 * every page in pagev[0..page_count) that is set is freed, followed by
 * the scrub_block itself. Counterpart of scrub_block_get().
 */
1422static void scrub_block_put(struct scrub_block *sblock)
1423{
1424 if (atomic_dec_and_test(&sblock->ref_count)) {
1425 int i;
1426
 /* entries may be NULL, so check each slot before freeing */
1427 for (i = 0; i < sblock->page_count; i++)
1428 if (sblock->pagev[i].page)
1429 __free_page(sblock->pagev[i].page);
1430 kfree(sblock);
1431 }
1432}
1433
1434static void scrub_submit(struct scrub_dev *sdev)
952{ 1435{
953 struct scrub_bio *sbio; 1436 struct scrub_bio *sbio;
954 1437
955 if (sdev->curr == -1) 1438 if (sdev->curr == -1)
956 return 0; 1439 return;
957 1440
958 sbio = sdev->bios[sdev->curr]; 1441 sbio = sdev->bios[sdev->curr];
959 sbio->err = 0;
960 sdev->curr = -1; 1442 sdev->curr = -1;
961 atomic_inc(&sdev->in_flight); 1443 atomic_inc(&sdev->in_flight);
962 1444
963 btrfsic_submit_bio(READ, sbio->bio); 1445 btrfsic_submit_bio(READ, sbio->bio);
964
965 return 0;
966} 1446}
967 1447
968static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 1448static int scrub_add_page_to_bio(struct scrub_dev *sdev,
969 u64 physical, u64 flags, u64 gen, int mirror_num, 1449 struct scrub_page *spage)
970 u8 *csum, int force)
971{ 1450{
1451 struct scrub_block *sblock = spage->sblock;
972 struct scrub_bio *sbio; 1452 struct scrub_bio *sbio;
973 struct page *page;
974 int ret; 1453 int ret;
975 1454
976again: 1455again:
@@ -983,7 +1462,7 @@ again:
983 if (sdev->curr != -1) { 1462 if (sdev->curr != -1) {
984 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1463 sdev->first_free = sdev->bios[sdev->curr]->next_free;
985 sdev->bios[sdev->curr]->next_free = -1; 1464 sdev->bios[sdev->curr]->next_free = -1;
986 sdev->bios[sdev->curr]->count = 0; 1465 sdev->bios[sdev->curr]->page_count = 0;
987 spin_unlock(&sdev->list_lock); 1466 spin_unlock(&sdev->list_lock);
988 } else { 1467 } else {
989 spin_unlock(&sdev->list_lock); 1468 spin_unlock(&sdev->list_lock);
@@ -991,62 +1470,200 @@ again:
991 } 1470 }
992 } 1471 }
993 sbio = sdev->bios[sdev->curr]; 1472 sbio = sdev->bios[sdev->curr];
994 if (sbio->count == 0) { 1473 if (sbio->page_count == 0) {
995 struct bio *bio; 1474 struct bio *bio;
996 1475
997 sbio->physical = physical; 1476 sbio->physical = spage->physical;
998 sbio->logical = logical; 1477 sbio->logical = spage->logical;
999 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); 1478 bio = sbio->bio;
1000 if (!bio) 1479 if (!bio) {
1001 return -ENOMEM; 1480 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
1481 if (!bio)
1482 return -ENOMEM;
1483 sbio->bio = bio;
1484 }
1002 1485
1003 bio->bi_private = sbio; 1486 bio->bi_private = sbio;
1004 bio->bi_end_io = scrub_bio_end_io; 1487 bio->bi_end_io = scrub_bio_end_io;
1005 bio->bi_bdev = sdev->dev->bdev; 1488 bio->bi_bdev = sdev->dev->bdev;
1006 bio->bi_sector = sbio->physical >> 9; 1489 bio->bi_sector = spage->physical >> 9;
1007 sbio->err = 0; 1490 sbio->err = 0;
1008 sbio->bio = bio; 1491 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1009 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1492 spage->physical ||
1010 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1493 sbio->logical + sbio->page_count * PAGE_SIZE !=
1011 ret = scrub_submit(sdev); 1494 spage->logical) {
1012 if (ret) 1495 scrub_submit(sdev);
1013 return ret;
1014 goto again; 1496 goto again;
1015 } 1497 }
1016 sbio->spag[sbio->count].flags = flags;
1017 sbio->spag[sbio->count].generation = gen;
1018 sbio->spag[sbio->count].have_csum = 0;
1019 sbio->spag[sbio->count].mirror_num = mirror_num;
1020
1021 page = alloc_page(GFP_NOFS);
1022 if (!page)
1023 return -ENOMEM;
1024 1498
1025 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); 1499 sbio->pagev[sbio->page_count] = spage;
1026 if (!ret) { 1500 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1027 __free_page(page); 1501 if (ret != PAGE_SIZE) {
1028 ret = scrub_submit(sdev); 1502 if (sbio->page_count < 1) {
1029 if (ret) 1503 bio_put(sbio->bio);
1030 return ret; 1504 sbio->bio = NULL;
1505 return -EIO;
1506 }
1507 scrub_submit(sdev);
1031 goto again; 1508 goto again;
1032 } 1509 }
1033 1510
1034 if (csum) { 1511 scrub_block_get(sblock); /* one for the added page */
1035 sbio->spag[sbio->count].have_csum = 1; 1512 atomic_inc(&sblock->outstanding_pages);
1036 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1513 sbio->page_count++;
1514 if (sbio->page_count == sdev->pages_per_bio)
1515 scrub_submit(sdev);
1516
1517 return 0;
1518}
1519
1520static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1521 u64 physical, u64 flags, u64 gen, int mirror_num,
1522 u8 *csum, int force)
1523{
1524 struct scrub_block *sblock;
1525 int index;
1526
1527 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1528 if (!sblock) {
1529 spin_lock(&sdev->stat_lock);
1530 sdev->stat.malloc_errors++;
1531 spin_unlock(&sdev->stat_lock);
1532 return -ENOMEM;
1037 } 1533 }
1038 ++sbio->count; 1534
1039 if (sbio->count == SCRUB_PAGES_PER_BIO || force) { 1535 /* one ref inside this function, plus one for each page later on */
1536 atomic_set(&sblock->ref_count, 1);
1537 sblock->sdev = sdev;
1538 sblock->no_io_error_seen = 1;
1539
1540 for (index = 0; len > 0; index++) {
1541 struct scrub_page *spage = sblock->pagev + index;
1542 u64 l = min_t(u64, len, PAGE_SIZE);
1543
1544 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1545 spage->page = alloc_page(GFP_NOFS);
1546 if (!spage->page) {
1547 spin_lock(&sdev->stat_lock);
1548 sdev->stat.malloc_errors++;
1549 spin_unlock(&sdev->stat_lock);
1550 while (index > 0) {
1551 index--;
1552 __free_page(sblock->pagev[index].page);
1553 }
1554 kfree(sblock);
1555 return -ENOMEM;
1556 }
1557 spage->sblock = sblock;
1558 spage->bdev = sdev->dev->bdev;
1559 spage->flags = flags;
1560 spage->generation = gen;
1561 spage->logical = logical;
1562 spage->physical = physical;
1563 spage->mirror_num = mirror_num;
1564 if (csum) {
1565 spage->have_csum = 1;
1566 memcpy(spage->csum, csum, sdev->csum_size);
1567 } else {
1568 spage->have_csum = 0;
1569 }
1570 sblock->page_count++;
1571 len -= l;
1572 logical += l;
1573 physical += l;
1574 }
1575
1576 BUG_ON(sblock->page_count == 0);
1577 for (index = 0; index < sblock->page_count; index++) {
1578 struct scrub_page *spage = sblock->pagev + index;
1040 int ret; 1579 int ret;
1041 1580
1042 ret = scrub_submit(sdev); 1581 ret = scrub_add_page_to_bio(sdev, spage);
1043 if (ret) 1582 if (ret) {
1583 scrub_block_put(sblock);
1044 return ret; 1584 return ret;
1585 }
1045 } 1586 }
1046 1587
1588 if (force)
1589 scrub_submit(sdev);
1590
1591 /* last one frees, either here or in bio completion for last page */
1592 scrub_block_put(sblock);
1047 return 0; 1593 return 0;
1048} 1594}
1049 1595
/*
 * bi_end_io callback installed on scrub bios by scrub_add_page_to_bio().
 * It only records the completion status and the bio in the owning
 * scrub_bio (taken from bio->bi_private) and then queues sbio->work on
 * fs_info->scrub_workers; no further processing is done here.
 */
1596static void scrub_bio_end_io(struct bio *bio, int err)
1597{
1598 struct scrub_bio *sbio = bio->bi_private;
1599 struct scrub_dev *sdev = sbio->sdev;
1600 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1601
1602 sbio->err = err;
1603 sbio->bio = bio;
1604
1605 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
1606}
1607
/*
 * Worker-side completion handling for a scrub_bio (presumably the work
 * function behind sbio->work, which scrub_bio_end_io() queues -- the
 * hookup is not visible here; confirm at the initialisation site).
 *
 * Steps:
 *  1. if the bio failed, flag every page of the bio with io_error and
 *     clear no_io_error_seen on the block each page belongs to;
 *  2. for every page, decrement the block's outstanding_pages counter,
 *     call scrub_block_complete() when it reaches zero, and drop the
 *     reference that scrub_add_page_to_bio() took for the page;
 *  3. release the bio and put the scrub_bio back on the device's free
 *     list, then decrement in_flight and wake waiters on list_wait.
 */
1608static void scrub_bio_end_io_worker(struct btrfs_work *work)
1609{
1610 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1611 struct scrub_dev *sdev = sbio->sdev;
1612 int i;
1613
1614 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
 /* the error status is per bio, so every page in it is marked bad */
1615 if (sbio->err) {
1616 for (i = 0; i < sbio->page_count; i++) {
1617 struct scrub_page *spage = sbio->pagev[i];
1618
1619 spage->io_error = 1;
1620 spage->sblock->no_io_error_seen = 0;
1621 }
1622 }
1623
1624 /* now complete the scrub_block items that have all pages completed */
1625 for (i = 0; i < sbio->page_count; i++) {
1626 struct scrub_page *spage = sbio->pagev[i];
1627 struct scrub_block *sblock = spage->sblock;
1628
1629 if (atomic_dec_and_test(&sblock->outstanding_pages))
1630 scrub_block_complete(sblock);
 /* pairs with the scrub_block_get() done when the page was added */
1631 scrub_block_put(sblock);
1632 }
1633
1634 if (sbio->err) {
1635 /* what is this good for??? */
1636 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1637 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
1638 sbio->bio->bi_phys_segments = 0;
1639 sbio->bio->bi_idx = 0;
1640
1641 for (i = 0; i < sbio->page_count; i++) {
1642 struct bio_vec *bi;
1643 bi = &sbio->bio->bi_io_vec[i];
1644 bi->bv_offset = 0;
1645 bi->bv_len = PAGE_SIZE;
1646 }
1647 }
1648
1649 bio_put(sbio->bio);
1650 sbio->bio = NULL;
 /* push this scrub_bio onto the head of the index-linked free list */
1651 spin_lock(&sdev->list_lock);
1652 sbio->next_free = sdev->first_free;
1653 sdev->first_free = sbio->index;
1654 spin_unlock(&sdev->list_lock);
1655 atomic_dec(&sdev->in_flight);
1656 wake_up(&sdev->list_wait);
1657}
1658
/*
 * Called once all pages of @sblock have finished their read I/O.
 * A block that saw any I/O error goes straight to
 * scrub_handle_errored_block(); otherwise its contents are verified
 * by scrub_checksum().
 */
1659static void scrub_block_complete(struct scrub_block *sblock)
1660{
1661 if (!sblock->no_io_error_seen)
1662 scrub_handle_errored_block(sblock);
1663 else
1664 scrub_checksum(sblock);
1665}
1666
1050static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 1667static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1051 u8 *csum) 1668 u8 *csum)
1052{ 1669{
@@ -1054,7 +1671,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1054 int ret = 0; 1671 int ret = 0;
1055 unsigned long i; 1672 unsigned long i;
1056 unsigned long num_sectors; 1673 unsigned long num_sectors;
1057 u32 sectorsize = sdev->dev->dev_root->sectorsize;
1058 1674
1059 while (!list_empty(&sdev->csum_list)) { 1675 while (!list_empty(&sdev->csum_list)) {
1060 sum = list_first_entry(&sdev->csum_list, 1676 sum = list_first_entry(&sdev->csum_list,
@@ -1072,7 +1688,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1072 if (!sum) 1688 if (!sum)
1073 return 0; 1689 return 0;
1074 1690
1075 num_sectors = sum->len / sectorsize; 1691 num_sectors = sum->len / sdev->sectorsize;
1076 for (i = 0; i < num_sectors; ++i) { 1692 for (i = 0; i < num_sectors; ++i) {
1077 if (sum->sums[i].bytenr == logical) { 1693 if (sum->sums[i].bytenr == logical) {
1078 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 1694 memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@ -1093,9 +1709,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1093{ 1709{
1094 int ret; 1710 int ret;
1095 u8 csum[BTRFS_CSUM_SIZE]; 1711 u8 csum[BTRFS_CSUM_SIZE];
1712 u32 blocksize;
1713
1714 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1715 blocksize = sdev->sectorsize;
1716 spin_lock(&sdev->stat_lock);
1717 sdev->stat.data_extents_scrubbed++;
1718 sdev->stat.data_bytes_scrubbed += len;
1719 spin_unlock(&sdev->stat_lock);
1720 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1721 BUG_ON(sdev->nodesize != sdev->leafsize);
1722 blocksize = sdev->nodesize;
1723 spin_lock(&sdev->stat_lock);
1724 sdev->stat.tree_extents_scrubbed++;
1725 sdev->stat.tree_bytes_scrubbed += len;
1726 spin_unlock(&sdev->stat_lock);
1727 } else {
1728 blocksize = sdev->sectorsize;
1729 BUG_ON(1);
1730 }
1096 1731
1097 while (len) { 1732 while (len) {
1098 u64 l = min_t(u64, len, PAGE_SIZE); 1733 u64 l = min_t(u64, len, blocksize);
1099 int have_csum = 0; 1734 int have_csum = 0;
1100 1735
1101 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1736 if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -1104,8 +1739,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1104 if (have_csum == 0) 1739 if (have_csum == 0)
1105 ++sdev->stat.no_csum; 1740 ++sdev->stat.no_csum;
1106 } 1741 }
1107 ret = scrub_page(sdev, logical, l, physical, flags, gen, 1742 ret = scrub_pages(sdev, logical, l, physical, flags, gen,
1108 mirror_num, have_csum ? csum : NULL, 0); 1743 mirror_num, have_csum ? csum : NULL, 0);
1109 if (ret) 1744 if (ret)
1110 return ret; 1745 return ret;
1111 len -= l; 1746 len -= l;
@@ -1170,6 +1805,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1170 if (!path) 1805 if (!path)
1171 return -ENOMEM; 1806 return -ENOMEM;
1172 1807
1808 /*
1809 * work on commit root. The related disk blocks are static as
1810 * long as COW is applied. This means, it is save to rewrite
1811 * them to repair disk errors without any race conditions
1812 */
1173 path->search_commit_root = 1; 1813 path->search_commit_root = 1;
1174 path->skip_locking = 1; 1814 path->skip_locking = 1;
1175 1815
@@ -1516,15 +2156,18 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1516 struct btrfs_device *device = sdev->dev; 2156 struct btrfs_device *device = sdev->dev;
1517 struct btrfs_root *root = device->dev_root; 2157 struct btrfs_root *root = device->dev_root;
1518 2158
2159 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2160 return -EIO;
2161
1519 gen = root->fs_info->last_trans_committed; 2162 gen = root->fs_info->last_trans_committed;
1520 2163
1521 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2164 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1522 bytenr = btrfs_sb_offset(i); 2165 bytenr = btrfs_sb_offset(i);
1523 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2166 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
1524 break; 2167 break;
1525 2168
1526 ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr, 2169 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
1527 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2170 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
1528 if (ret) 2171 if (ret)
1529 return ret; 2172 return ret;
1530 } 2173 }
@@ -1583,10 +2226,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1583 /* 2226 /*
1584 * check some assumptions 2227 * check some assumptions
1585 */ 2228 */
1586 if (root->sectorsize != PAGE_SIZE || 2229 if (root->nodesize != root->leafsize) {
1587 root->sectorsize != root->leafsize || 2230 printk(KERN_ERR
1588 root->sectorsize != root->nodesize) { 2231 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
1589 printk(KERN_ERR "btrfs_scrub: size assumptions fail\n"); 2232 root->nodesize, root->leafsize);
2233 return -EINVAL;
2234 }
2235
2236 if (root->nodesize > BTRFS_STRIPE_LEN) {
2237 /*
2238 * in this case scrub is unable to calculate the checksum
2239 * the way scrub is implemented. Do not handle this
2240 * situation at all because it won't ever happen.
2241 */
2242 printk(KERN_ERR
2243 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2244 root->nodesize, BTRFS_STRIPE_LEN);
2245 return -EINVAL;
2246 }
2247
2248 if (root->sectorsize != PAGE_SIZE) {
2249 /* not supported for data w/o checksums */
2250 printk(KERN_ERR
2251 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2252 root->sectorsize, (unsigned long long)PAGE_SIZE);
1590 return -EINVAL; 2253 return -EINVAL;
1591 } 2254 }
1592 2255
@@ -1656,7 +2319,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1656 return ret; 2319 return ret;
1657} 2320}
1658 2321
1659int btrfs_scrub_pause(struct btrfs_root *root) 2322void btrfs_scrub_pause(struct btrfs_root *root)
1660{ 2323{
1661 struct btrfs_fs_info *fs_info = root->fs_info; 2324 struct btrfs_fs_info *fs_info = root->fs_info;
1662 2325
@@ -1671,34 +2334,28 @@ int btrfs_scrub_pause(struct btrfs_root *root)
1671 mutex_lock(&fs_info->scrub_lock); 2334 mutex_lock(&fs_info->scrub_lock);
1672 } 2335 }
1673 mutex_unlock(&fs_info->scrub_lock); 2336 mutex_unlock(&fs_info->scrub_lock);
1674
1675 return 0;
1676} 2337}
1677 2338
1678int btrfs_scrub_continue(struct btrfs_root *root) 2339void btrfs_scrub_continue(struct btrfs_root *root)
1679{ 2340{
1680 struct btrfs_fs_info *fs_info = root->fs_info; 2341 struct btrfs_fs_info *fs_info = root->fs_info;
1681 2342
1682 atomic_dec(&fs_info->scrub_pause_req); 2343 atomic_dec(&fs_info->scrub_pause_req);
1683 wake_up(&fs_info->scrub_pause_wait); 2344 wake_up(&fs_info->scrub_pause_wait);
1684 return 0;
1685} 2345}
1686 2346
1687int btrfs_scrub_pause_super(struct btrfs_root *root) 2347void btrfs_scrub_pause_super(struct btrfs_root *root)
1688{ 2348{
1689 down_write(&root->fs_info->scrub_super_lock); 2349 down_write(&root->fs_info->scrub_super_lock);
1690 return 0;
1691} 2350}
1692 2351
1693int btrfs_scrub_continue_super(struct btrfs_root *root) 2352void btrfs_scrub_continue_super(struct btrfs_root *root)
1694{ 2353{
1695 up_write(&root->fs_info->scrub_super_lock); 2354 up_write(&root->fs_info->scrub_super_lock);
1696 return 0;
1697} 2355}
1698 2356
1699int btrfs_scrub_cancel(struct btrfs_root *root) 2357int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
1700{ 2358{
1701 struct btrfs_fs_info *fs_info = root->fs_info;
1702 2359
1703 mutex_lock(&fs_info->scrub_lock); 2360 mutex_lock(&fs_info->scrub_lock);
1704 if (!atomic_read(&fs_info->scrubs_running)) { 2361 if (!atomic_read(&fs_info->scrubs_running)) {
@@ -1719,6 +2376,11 @@ int btrfs_scrub_cancel(struct btrfs_root *root)
1719 return 0; 2376 return 0;
1720} 2377}
1721 2378
2379int btrfs_scrub_cancel(struct btrfs_root *root)
2380{
2381 return __btrfs_scrub_cancel(root->fs_info);
2382}
2383
1722int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) 2384int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1723{ 2385{
1724 struct btrfs_fs_info *fs_info = root->fs_info; 2386 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1741,6 +2403,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1741 2403
1742 return 0; 2404 return 0;
1743} 2405}
2406
1744int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) 2407int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
1745{ 2408{
1746 struct btrfs_fs_info *fs_info = root->fs_info; 2409 struct btrfs_fs_info *fs_info = root->fs_info;