path: root/fs/btrfs/scrub.c
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--  fs/btrfs/scrub.c | 591
1 file changed, 485 insertions(+), 106 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..ed11d3866afd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-featured scrub. It reads all 30 * This is only the first step towards a full-featured scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,361 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the possibility that ipath might have been
266 * too small to hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
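For reference, a minimal userspace sketch of the arithmetic scrub_print_warning uses above to locate the bad page: the byte offset of page ix within the bio is added to the bio's physical and logical start, and the device offset is converted to 512-byte sectors by shifting right by 9. The constants are made-up example values and the 4 KiB page size is an assumption.

#include <stdint.h>
#include <stdio.h>

#define SCRUB_PAGE_SIZE 4096ULL             /* assumption: 4 KiB pages */

int main(void)
{
        uint64_t physical = 0x40000000ULL;  /* example: device byte offset of the bio */
        uint64_t logical  = 0x12300000ULL;  /* example: btrfs logical byte offset */
        int ix = 3;                         /* index of the bad page within the bio */

        uint64_t sector  = (physical + ix * SCRUB_PAGE_SIZE) >> 9; /* 512-byte sectors */
        uint64_t bad_log = logical + ix * SCRUB_PAGE_SIZE;

        printf("sector %llu, logical %llu\n",
               (unsigned long long)sector, (unsigned long long)bad_log);
        return 0;
}
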
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defective sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the time being, we treat this error as
408 * uncorrectable, although there is a chance that a
409 * later scrub will find the bad sector again when
410 * there is no dirty page in memory.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
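To make the EXTENT_DAMAGED round-trip in scrub_fixup_readpage easier to follow, here is a tiny userspace model (the flag and helper below are hypothetical stand-ins, not the kernel io_tree API): scrub marks the range as damaged, forces a read of the bad mirror, and afterwards treats the page as corrected only if the read path cleared the flag again.

#include <stdbool.h>
#include <stdio.h>

struct range_state {
        bool damaged;                  /* stand-in for EXTENT_DAMAGED on the io_tree */
};

/* stand-in for the forced readpage of the bad mirror: on-the-fly repair
 * rewrites the sector and clears the flag; if repair fails, it stays set */
static void read_bad_mirror(struct range_state *rs, bool repair_succeeds)
{
        if (repair_succeeds)
                rs->damaged = false;
}

int main(void)
{
        struct range_state rs = { .damaged = false };

        rs.damaged = true;             /* scrub marks the range before reading */
        read_bad_mirror(&rs, true);    /* regular read path runs and repairs */

        printf("corrected: %s\n", rs.damaged ? "no" : "yes");
        return 0;
}
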
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see the caller for why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
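A rough sketch of the counter bookkeeping this worker participates in, with plain integers standing in for the kernel atomics (the struct and helpers are illustrative only): the fixup worker counts as running, so scrub cancellation cannot complete underneath it, and it also counts as paused, so a transaction commit that pauses the scrub is not blocked by the worker; btrfs_scrub_dev() additionally waits for fixup_cnt to drop to zero before reporting final statistics.

#include <assert.h>

struct scrub_counters {
        int scrubs_running;   /* cancellation waits for this to reach 0 */
        int scrubs_paused;    /* a pause request waits until paused == running */
        int fixup_cnt;        /* btrfs_scrub_dev() waits for this to reach 0 */
};

static void fixup_worker_start(struct scrub_counters *c)
{
        c->scrubs_running++;  /* blocks cancellation while the worker runs */
        c->scrubs_paused++;   /* but does not hold up transaction commits */
        c->fixup_cnt++;
}

static void fixup_worker_done(struct scrub_counters *c)
{
        c->scrubs_running--;
        c->scrubs_paused--;
        c->fixup_cnt--;
}

int main(void)
{
        struct scrub_counters c = { 1, 0, 0 };   /* the main scrub loop is running */

        fixup_worker_start(&c);
        /* once the main scrub also pauses itself, paused == running holds even
         * though the worker keeps going, so a commit is not deadlocked */
        c.scrubs_paused++;
        assert(c.scrubs_paused == c.scrubs_running);
        c.scrubs_paused--;

        fixup_worker_done(&c);
        assert(c.fixup_cnt == 0);
        return 0;
}
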
198/* 545/*
199 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 549 * one may be bad
203 */ 550 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
206 if (sbio->err) { 558 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 562 return 0;
212 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
213 } 569 }
214 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
215 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
216} 577}
217 578
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 617 u64 length;
256 int i; 618 int i;
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 621
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 631 /*
263 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
265 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transaction commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all practical matters. effectively, we only
638 * prevent cancellation requests from completing.
266 */ 639 */
267 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
268 } 648 }
269 649
270 length = PAGE_SIZE; 650 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 652 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 654 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 656 (unsigned long long)logical);
277 WARN_ON(1); 657 WARN_ON(1);
658 kfree(bbio);
278 return; 659 return;
279 } 660 }
280 661
281 if (multi->num_stripes == 1) 662 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 663 /* there aren't any replicas */
283 goto uncorrectable; 664 goto uncorrectable;
284 665
285 /* 666 /*
286 * first find a good copy 667 * first find a good copy
287 */ 668 */
288 for (i = 0; i < multi->num_stripes; ++i) { 669 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 670 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 671 continue;
291 672
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 673 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 674 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 675 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 676 /* I/O-error, this is not a good copy */
296 continue; 677 continue;
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 680 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 681 break;
301 } 682 }
302 if (i == multi->num_stripes) 683 if (i == bbio->num_stripes)
303 goto uncorrectable; 684 goto uncorrectable;
304 685
305 if (!sdev->readonly) { 686 if (!sdev->readonly) {
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 695 }
315 } 696 }
316 697
317 kfree(multi); 698 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 699 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 700 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 701 spin_unlock(&sdev->stat_lock);
321 702
322 if (printk_ratelimit()) 703 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 704 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 705 return;
326 706
327uncorrectable: 707uncorrectable:
328 kfree(multi); 708 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 709 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 710 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 711 spin_unlock(&sdev->stat_lock);
332 712
333 if (printk_ratelimit()) 713 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 714 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 715}
337 716
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 717static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 761 int ret;
383 762
384 if (sbio->err) { 763 if (sbio->err) {
764 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 765 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 766 ret |= scrub_recheck_error(sbio, i);
767 if (!ret) {
768 spin_lock(&sdev->stat_lock);
769 ++sdev->stat.unverified_errors;
770 spin_unlock(&sdev->stat_lock);
771 }
387 772
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 773 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 774 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
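
The change above ORs the per-page results of scrub_recheck_error instead of discarding them, so a single genuinely bad page in the bio cannot be masked by pages that turn out fine on re-read; only when every page re-reads and verifies cleanly is the bio failure counted as an unverified error. A small sketch of that aggregation (the helper is a stand-in, not the kernel function):

#include <stdio.h>

/* stand-in for scrub_recheck_error(): returns 1 if the page really was bad
 * (and a fixup was started), 0 if the re-read found nothing wrong */
static int recheck_page(int page_was_bad)
{
        return page_was_bad ? 1 : 0;
}

int main(void)
{
        int pages[4] = { 0, 1, 0, 0 };   /* one genuinely bad page in the bio */
        int ret = 0;

        for (int i = 0; i < 4; i++)
                ret |= recheck_page(pages[i]);

        if (!ret)
                printf("unverified error (bio failed, but all pages check out)\n");
        else
                printf("real error, handled by the fixup path\n");
        return 0;
}
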
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 781 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 782 bi->bv_len = PAGE_SIZE;
398 } 783 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 784 goto out;
404 } 785 }
405 for (i = 0; i < sbio->count; ++i) { 786 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 801 WARN_ON(1);
421 } 802 }
422 kunmap_atomic(buffer, KM_USER0); 803 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 804 if (ret) {
424 scrub_recheck_error(sbio, i); 805 ret = scrub_recheck_error(sbio, i);
806 if (!ret) {
807 spin_lock(&sdev->stat_lock);
808 ++sdev->stat.unverified_errors;
809 spin_unlock(&sdev->stat_lock);
810 }
811 }
425 } 812 }
426 813
427out: 814out:
@@ -604,7 +991,7 @@ nomem:
604} 991}
605 992
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 993static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 994 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 995 u8 *csum, int force)
609{ 996{
610 struct scrub_bio *sbio; 997 struct scrub_bio *sbio;
@@ -701,7 +1088,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1088
702/* scrub extent tries to collect up to 64 kB for each bio */ 1089/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1090static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1091 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1092{
706 int ret; 1093 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1094 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1128,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1128 int slot;
742 int i; 1129 int i;
743 u64 nstripes; 1130 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1131 struct extent_buffer *l;
746 struct btrfs_key key; 1132 struct btrfs_key key;
747 u64 physical; 1133 u64 physical;
748 u64 logical; 1134 u64 logical;
749 u64 generation; 1135 u64 generation;
750 u64 mirror_num; 1136 int mirror_num;
1137 struct reada_control *reada1;
1138 struct reada_control *reada2;
1139 struct btrfs_key key_start;
1140 struct btrfs_key key_end;
751 1141
752 u64 increment = map->stripe_len; 1142 u64 increment = map->stripe_len;
753 u64 offset; 1143 u64 offset;
@@ -758,102 +1148,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1148 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1149 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1150 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1151 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1152 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1153 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1154 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1155 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1156 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1157 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1158 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1159 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1160 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1161 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1162 mirror_num = num % map->num_stripes + 1;
773 } else { 1163 } else {
774 increment = map->stripe_len; 1164 increment = map->stripe_len;
775 mirror_num = 0; 1165 mirror_num = 1;
776 } 1166 }
777 1167
778 path = btrfs_alloc_path(); 1168 path = btrfs_alloc_path();
779 if (!path) 1169 if (!path)
780 return -ENOMEM; 1170 return -ENOMEM;
781 1171
782 path->reada = 2;
783 path->search_commit_root = 1; 1172 path->search_commit_root = 1;
784 path->skip_locking = 1; 1173 path->skip_locking = 1;
785 1174
786 /* 1175 /*
787 * find all extents for each stripe and just read them to get 1176 * trigger the readahead for the extent tree and csum tree and wait for
788 * them into the page cache 1177 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1178 * to not hold off transaction commits
790 */ 1179 */
791 logical = base + offset; 1180 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1181
817 break; 1182 wait_event(sdev->list_wait,
818 } 1183 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1184 atomic_inc(&fs_info->scrubs_paused);
1185 wake_up(&fs_info->scrub_pause_wait);
820 1186
821 if (key.objectid >= logical + map->stripe_len) 1187 /* FIXME it might be better to start readahead at commit root */
822 break; 1188 key_start.objectid = logical;
1189 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1190 key_start.offset = (u64)0;
1191 key_end.objectid = base + offset + nstripes * increment;
1192 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1193 key_end.offset = (u64)0;
1194 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1195
1196 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1197 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1198 key_start.offset = logical;
1199 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1200 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1201 key_end.offset = base + offset + nstripes * increment;
1202 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1203
1204 if (!IS_ERR(reada1))
1205 btrfs_reada_wait(reada1);
1206 if (!IS_ERR(reada2))
1207 btrfs_reada_wait(reada2);
823 1208
824 path->slots[0]++; 1209 mutex_lock(&fs_info->scrub_lock);
825 } 1210 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1211 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1212 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1213 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1214 mutex_lock(&fs_info->scrub_lock);
830 } 1215 }
1216 atomic_dec(&fs_info->scrubs_paused);
1217 mutex_unlock(&fs_info->scrub_lock);
1218 wake_up(&fs_info->scrub_pause_wait);
831 1219
832 /* 1220 /*
833 * collect all data csums for the stripe to avoid seeking during 1221 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1222 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1223 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1224 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1225
847 logical += increment;
848 cond_resched();
849 }
850 /* 1226 /*
851 * now find all extents for each stripe and scrub them 1227 * now find all extents for each stripe and scrub them
852 */ 1228 */
853 logical = base + offset + start_stripe * increment; 1229 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1230 physical = map->stripes[num].physical;
855 ret = 0; 1231 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1232 for (i = 0; i < nstripes; ++i) {
857 /* 1233 /*
858 * canceled? 1234 * canceled?
859 */ 1235 */
@@ -882,11 +1258,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1258 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1259 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1260 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1261 }
889 1262
1263 ret = btrfs_lookup_csums_range(csum_root, logical,
1264 logical + map->stripe_len - 1,
1265 &sdev->csum_list, 1);
1266 if (ret)
1267 goto out;
1268
890 key.objectid = logical; 1269 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1270 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1271 key.offset = (u64)0;
@@ -982,7 +1361,6 @@ next:
982 1361
983out: 1362out:
984 blk_finish_plug(&plug); 1363 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1364 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1365 return ret < 0 ? ret : 0;
988} 1366}
@@ -1253,10 +1631,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1631 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1632
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1633 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1634 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1635 wake_up(&fs_info->scrub_pause_wait);
1259 1636
1637 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1638
1260 if (progress) 1639 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1640 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1641