Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--	fs/btrfs/scrub.c	660
1 file changed, 519 insertions(+), 141 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..c27bcb67f330 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/ratelimit.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
  * any can be found.
  *
  * Future enhancements:
- * - To enhance the performance, better read-ahead strategies for the
- *   extent-tree can be employed.
  * - In case an unrepairable extent is encountered, track which files are
  *   affected and report them
  * - In case of a read error on files with nodatasum, map the file and read
  *   the extent to trigger a writeback of the good copy
  * - track and record media errors, throw out bad devices
  * - add a mode to also read unallocated space
- * - make the prefetch cancellable
  */
 
 struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
 struct scrub_page {
 	u64			flags;  /* extent flags */
 	u64			generation;
-	u64			mirror_num;
+	int			mirror_num;
 	int			have_csum;
 	u8			csum[BTRFS_CSUM_SIZE];
 };
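
The type change above pairs with a semantic change elsewhere in this patch: mirror numbers become 1-based (mirror_num == 0 now means "any mirror"), so stripe index i corresponds to mirror i + 1 and the later fixup loop skips the known-bad copy with i + 1 == mirror_num. A minimal illustrative sketch of that convention; all names here are hypothetical stand-ins, not from the kernel tree:

/*
 * Illustrative only: 1-based mirror numbering. Given the number of
 * stripes and the known-bad mirror, return the index of some other
 * stripe to try, or -1 if there is none.
 */
static int pick_alternate_stripe(int num_stripes, int bad_mirror_num)
{
	int i;

	for (i = 0; i < num_stripes; ++i) {
		if (i + 1 == bad_mirror_num)	/* stripe i is mirror i + 1 */
			continue;
		return i;			/* first candidate copy */
	}
	return -1;				/* no alternative copy */
}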
@@ -87,6 +88,7 @@ struct scrub_dev {
 	int			first_free;
 	int			curr;
 	atomic_t		in_flight;
+	atomic_t		fixup_cnt;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
 	spinlock_t		stat_lock;
 };
 
+struct scrub_fixup_nodatasum {
+	struct scrub_dev	*sdev;
+	u64			logical;
+	struct btrfs_root	*root;
+	struct btrfs_work	work;
+	int			mirror_num;
+};
+
+struct scrub_warning {
+	struct btrfs_path	*path;
+	u64			extent_item_size;
+	char			*scratch_buf;
+	char			*msg_buf;
+	const char		*errstr;
+	sector_t		sector;
+	u64			logical;
+	struct btrfs_device	*dev;
+	int			msg_bufsize;
+	int			scratch_bufsize;
+};
+
 static void scrub_free_csums(struct scrub_dev *sdev)
 {
 	while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 
 		if (i != SCRUB_BIOS_PER_DEV-1)
 			sdev->bios[i]->next_free = i + 1;
 		else
 			sdev->bios[i]->next_free = -1;
 	}
 	sdev->first_free = 0;
 	sdev->curr = -1;
 	atomic_set(&sdev->in_flight, 0);
+	atomic_set(&sdev->fixup_cnt, 0);
 	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	INIT_LIST_HEAD(&sdev->csum_list);
 
 	spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	u64 isize;
+	u32 nlink;
+	int ret;
+	int i;
+	struct extent_buffer *eb;
+	struct btrfs_inode_item *inode_item;
+	struct scrub_warning *swarn = ctx;
+	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+	struct inode_fs_paths *ipath = NULL;
+	struct btrfs_root *local_root;
+	struct btrfs_key root_key;
+
+	root_key.objectid = root;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(local_root)) {
+		ret = PTR_ERR(local_root);
+		goto err;
+	}
+
+	ret = inode_item_info(inum, 0, local_root, swarn->path);
+	if (ret) {
+		btrfs_release_path(swarn->path);
+		goto err;
+	}
+
+	eb = swarn->path->nodes[0];
+	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+					struct btrfs_inode_item);
+	isize = btrfs_inode_size(eb, inode_item);
+	nlink = btrfs_inode_nlink(eb, inode_item);
+	btrfs_release_path(swarn->path);
+
+	ipath = init_ipath(4096, local_root, swarn->path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		ipath = NULL;
+		goto err;
+	}
+	ret = paths_from_inode(inum, ipath);
+
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * we deliberately ignore the bit ipath might have been too small to
+	 * hold all of the paths here
+	 */
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
+			"length %llu, links %u (path: %s)\n", swarn->errstr,
+			swarn->logical, swarn->dev->name,
+			(unsigned long long)swarn->sector, root, inum, offset,
+			min(isize - offset, (u64)PAGE_SIZE), nlink,
+			(char *)(unsigned long)ipath->fspath->val[i]);
+
+	free_ipath(ipath);
+	return 0;
+
+err:
+	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+		"resolving failed with ret=%d\n", swarn->errstr,
+		swarn->logical, swarn->dev->name,
+		(unsigned long long)swarn->sector, root, inum, offset, ret);
+
+	free_ipath(ipath);
+	return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
+				int ix)
+{
+	struct btrfs_device *dev = sbio->sdev->dev;
+	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_path *path;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct scrub_warning swarn;
+	u32 item_size;
+	int ret;
+	u64 ref_root;
+	u8 ref_level;
+	unsigned long ptr = 0;
+	const int bufsize = 4096;
+	u64 extent_offset;
+
+	path = btrfs_alloc_path();
+
+	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+	swarn.logical = sbio->logical + ix * PAGE_SIZE;
+	swarn.errstr = errstr;
+	swarn.dev = dev;
+	swarn.msg_bufsize = bufsize;
+	swarn.scratch_bufsize = bufsize;
+
+	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+		goto out;
+
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+	if (ret < 0)
+		goto out;
+
+	extent_offset = swarn.logical - found_key.objectid;
+	swarn.extent_item_size = found_key.offset;
+
+	eb = path->nodes[0];
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		do {
+			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
+							&ref_root, &ref_level);
+			printk(KERN_WARNING "%s at logical %llu on dev %s, "
+				"sector %llu: metadata %s (level %d) in tree "
+				"%llu\n", errstr, swarn.logical, dev->name,
+				(unsigned long long)swarn.sector,
+				ref_level ? "node" : "leaf",
+				ret < 0 ? -1 : ref_level,
+				ret < 0 ? -1 : ref_root);
+		} while (ret != 1);
+	} else {
+		swarn.path = path;
+		iterate_extent_inodes(fs_info, path, found_key.objectid,
+					extent_offset,
+					scrub_print_warning_inode, &swarn);
+	}
+
+out:
+	btrfs_free_path(path);
+	kfree(swarn.scratch_buf);
+	kfree(swarn.msg_buf);
+}
+
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct page *page = NULL;
+	unsigned long index;
+	struct scrub_fixup_nodatasum *fixup = ctx;
+	int ret;
+	int corrected = 0;
+	struct btrfs_key key;
+	struct inode *inode = NULL;
+	u64 end = offset + PAGE_SIZE - 1;
+	struct btrfs_root *local_root;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+	if (IS_ERR(local_root))
+		return PTR_ERR(local_root);
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	index = offset >> PAGE_CACHE_SHIFT;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (PageUptodate(page)) {
+		struct btrfs_mapping_tree *map_tree;
+		if (PageDirty(page)) {
+			/*
+			 * we need to write the data to the defect sector. the
+			 * data that was in that sector is not in memory,
+			 * because the page was modified. we must not write the
+			 * modified page to that sector.
+			 *
+			 * TODO: what could be done here: wait for the delalloc
+			 *       runner to write out that page (might involve
+			 *       COW) and see whether the sector is still
+			 *       referenced afterwards.
+			 *
+			 * For the meantime, we'll treat this error
+			 * incorrectable, although there is a chance that a
+			 * later scrub will find the bad sector again and that
+			 * there's no dirty page in memory, then.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+					fixup->logical, page,
+					fixup->mirror_num);
+		unlock_page(page);
+		corrected = !ret;
+	} else {
+		/*
+		 * we need to get good data first. the general readpage path
+		 * will call repair_io_failure for us, we just have to make
+		 * sure we read the bad mirror.
+		 */
+		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret) {
+			/* set_extent_bits should give proper error */
+			WARN_ON(ret > 0);
+			if (ret > 0)
+				ret = -EFAULT;
+			goto out;
+		}
+
+		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+						btrfs_get_extent,
+						fixup->mirror_num);
+		wait_on_page_locked(page);
+
+		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+					end, EXTENT_DAMAGED, 0, NULL);
+		if (!corrected)
+			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+						EXTENT_DAMAGED, GFP_NOFS);
+	}
+
+out:
+	if (page)
+		put_page(page);
+	if (inode)
+		iput(inode);
+
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0 && corrected) {
+		/*
+		 * we only need to call readpage for one of the inodes belonging
+		 * to this extent. so make iterate_extent_inodes stop
+		 */
+		return 1;
+	}
+
+	return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+	int ret;
+	struct scrub_fixup_nodatasum *fixup;
+	struct scrub_dev *sdev;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	int uncorrectable = 0;
+
+	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+	sdev = fixup->sdev;
+	fs_info = fixup->root->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.malloc_errors;
+		spin_unlock(&sdev->stat_lock);
+		uncorrectable = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(fixup->root);
+	if (IS_ERR(trans)) {
+		uncorrectable = 1;
+		goto out;
+	}
+
+	/*
+	 * the idea is to trigger a regular read through the standard path. we
+	 * read a page from the (failed) logical address by specifying the
+	 * corresponding copynum of the failed sector. thus, that readpage is
+	 * expected to fail.
+	 * that is the point where on-the-fly error correction will kick in
+	 * (once it's finished) and rewrite the failed sector if a good copy
+	 * can be found.
+	 */
+	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+						path, scrub_fixup_readpage,
+						fixup);
+	if (ret < 0) {
+		uncorrectable = 1;
+		goto out;
+	}
+	WARN_ON(ret != 1);
+
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.corrected_errors;
+	spin_unlock(&sdev->stat_lock);
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fixup->root);
+	if (uncorrectable) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.uncorrectable_errors;
+		spin_unlock(&sdev->stat_lock);
+		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+					"(nodatasum) error at logical %llu\n",
+					fixup->logical);
+	}
+
+	btrfs_free_path(path);
+	kfree(fixup);
+
+	/* see caller why we're pretending to be paused in the scrub counters */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sdev->fixup_cnt);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sdev->list_wait);
+}
+
 /*
  * scrub_recheck_error gets called when either verification of the page
  * failed or the bio failed to read, e.g. with EIO. In the latter case,
  * recheck_error gets called for every page in the bio, even though only
  * one may be bad
  */
-static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 {
+	struct scrub_dev *sdev = sbio->sdev;
+	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+					DEFAULT_RATELIMIT_BURST);
+
 	if (sbio->err) {
-		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
-				   (sbio->physical + ix * PAGE_SIZE) >> 9,
+		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
 				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
 			if (scrub_fixup_check(sbio, ix) == 0)
-				return;
+				return 0;
 		}
+		if (__ratelimit(&_rs))
+			scrub_print_warning("i/o error", sbio, ix);
+	} else {
+		if (__ratelimit(&_rs))
+			scrub_print_warning("checksum error", sbio, ix);
 	}
 
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.read_errors;
+	spin_unlock(&sdev->stat_lock);
+
 	scrub_fixup(sbio, ix);
+	return 1;
 }
 
 static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
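
scrub_recheck_error() above relies on the <linux/ratelimit.h> helpers pulled in by the new include: DEFINE_RATELIMIT_STATE() declares a static rate-limit state with an interval and a burst count, and __ratelimit() returns nonzero as long as the caller is still within its budget for the current interval. A self-contained sketch of the same pattern; the function name and message are made up for illustration:

#include <linux/kernel.h>
#include <linux/ratelimit.h>
#include <linux/types.h>

static void report_bad_sector(u64 sector)
{
	/* at most DEFAULT_RATELIMIT_BURST reports per DEFAULT_RATELIMIT_INTERVAL */
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (__ratelimit(&_rs))
		printk(KERN_WARNING "bad sector %llu\n",
		       (unsigned long long)sector);
}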
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	struct scrub_dev *sdev = sbio->sdev;
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
+	struct scrub_fixup_nodatasum *fixup;
 	u64 logical = sbio->logical + ix * PAGE_SIZE;
 	u64 length;
 	int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
 	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 	    (sbio->spag[ix].have_csum == 0)) {
+		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+		if (!fixup)
+			goto uncorrectable;
+		fixup->sdev = sdev;
+		fixup->logical = logical;
+		fixup->root = fs_info->extent_root;
+		fixup->mirror_num = sbio->spag[ix].mirror_num;
 		/*
-		 * nodatasum, don't try to fix anything
-		 * FIXME: we can do better, open the inode and trigger a
-		 * writeback
+		 * increment scrubs_running to prevent cancel requests from
+		 * completing as long as a fixup worker is running. we must also
+		 * increment scrubs_paused to prevent deadlocking on pause
+		 * requests used for transactions commits (as the worker uses a
+		 * transaction context). it is safe to regard the fixup worker
+		 * as paused for all matters practical. effectively, we only
+		 * avoid cancellation requests from completing.
 		 */
-		goto uncorrectable;
+		mutex_lock(&fs_info->scrub_lock);
+		atomic_inc(&fs_info->scrubs_running);
+		atomic_inc(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		atomic_inc(&sdev->fixup_cnt);
+		fixup->work.func = scrub_fixup_nodatasum;
+		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+		return;
 	}
 
 	length = PAGE_SIZE;
 	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-			      &multi, 0);
-	if (ret || !multi || length < PAGE_SIZE) {
+			      &bbio, 0);
+	if (ret || !bbio || length < PAGE_SIZE) {
 		printk(KERN_ERR
 		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
 		       (unsigned long long)logical);
 		WARN_ON(1);
+		kfree(bbio);
 		return;
 	}
 
-	if (multi->num_stripes == 1)
+	if (bbio->num_stripes == 1)
 		/* there aren't any replicas */
 		goto uncorrectable;
 
 	/*
 	 * first find a good copy
 	 */
-	for (i = 0; i < multi->num_stripes; ++i) {
-		if (i == sbio->spag[ix].mirror_num)
+	for (i = 0; i < bbio->num_stripes; ++i) {
+		if (i + 1 == sbio->spag[ix].mirror_num)
 			continue;
 
-		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
-				   multi->stripes[i].physical >> 9,
+		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+				   bbio->stripes[i].physical >> 9,
 				   sbio->bio->bi_io_vec[ix].bv_page)) {
 			/* I/O-error, this is not a good copy */
 			continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		if (scrub_fixup_check(sbio, ix) == 0)
 			break;
 	}
-	if (i == multi->num_stripes)
+	if (i == bbio->num_stripes)
 		goto uncorrectable;
 
 	if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		}
 	}
 
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.corrected_errors;
 	spin_unlock(&sdev->stat_lock);
 
-	if (printk_ratelimit())
-		printk(KERN_ERR "btrfs: fixed up at %llu\n",
-		       (unsigned long long)logical);
+	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
+			   (unsigned long long)logical);
 	return;
 
 uncorrectable:
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.uncorrectable_errors;
 	spin_unlock(&sdev->stat_lock);
 
-	if (printk_ratelimit())
-		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
-		       (unsigned long long)logical);
+	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
+			   "logical %llu\n", (unsigned long long)logical);
 }
 
 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
 	int ret;
 
 	if (sbio->err) {
+		ret = 0;
 		for (i = 0; i < sbio->count; ++i)
-			scrub_recheck_error(sbio, i);
+			ret |= scrub_recheck_error(sbio, i);
+		if (!ret) {
+			spin_lock(&sdev->stat_lock);
+			++sdev->stat.unverified_errors;
+			spin_unlock(&sdev->stat_lock);
+		}
 
 		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
 			bi->bv_offset = 0;
 			bi->bv_len = PAGE_SIZE;
 		}
-
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.read_errors;
-		spin_unlock(&sdev->stat_lock);
 		goto out;
 	}
 	for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
 			WARN_ON(1);
 		}
 		kunmap_atomic(buffer, KM_USER0);
-		if (ret)
-			scrub_recheck_error(sbio, i);
+		if (ret) {
+			ret = scrub_recheck_error(sbio, i);
+			if (!ret) {
+				spin_lock(&sdev->stat_lock);
+				++sdev->stat.unverified_errors;
+				spin_unlock(&sdev->stat_lock);
+			}
+		}
 	}
 
 out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
 	struct scrub_bio *sbio;
-	struct bio *bio;
-	int i;
 
 	if (sdev->curr == -1)
 		return 0;
 
 	sbio = sdev->bios[sdev->curr];
-
-	bio = bio_alloc(GFP_NOFS, sbio->count);
-	if (!bio)
-		goto nomem;
-
-	bio->bi_private = sbio;
-	bio->bi_end_io = scrub_bio_end_io;
-	bio->bi_bdev = sdev->dev->bdev;
-	bio->bi_sector = sbio->physical >> 9;
-
-	for (i = 0; i < sbio->count; ++i) {
-		struct page *page;
-		int ret;
-
-		page = alloc_page(GFP_NOFS);
-		if (!page)
-			goto nomem;
-
-		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-		if (!ret) {
-			__free_page(page);
-			goto nomem;
-		}
-	}
-
 	sbio->err = 0;
 	sdev->curr = -1;
 	atomic_inc(&sdev->in_flight);
 
-	submit_bio(READ, bio);
+	submit_bio(READ, sbio->bio);
 
 	return 0;
-
-nomem:
-	scrub_free_bio(bio);
-
-	return -ENOMEM;
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
+		      u64 physical, u64 flags, u64 gen, int mirror_num,
 		      u8 *csum, int force)
 {
 	struct scrub_bio *sbio;
+	struct page *page;
+	int ret;
 
 again:
 	/*
@@ -628,12 +990,22 @@ again:
 	}
 	sbio = sdev->bios[sdev->curr];
 	if (sbio->count == 0) {
+		struct bio *bio;
+
 		sbio->physical = physical;
 		sbio->logical = logical;
+		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+		if (!bio)
+			return -ENOMEM;
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_bdev = sdev->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+		sbio->bio = bio;
 	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-		int ret;
-
 		ret = scrub_submit(sdev);
 		if (ret)
 			return ret;
@@ -643,6 +1015,20 @@ again:
 	sbio->spag[sbio->count].generation = gen;
 	sbio->spag[sbio->count].have_csum = 0;
 	sbio->spag[sbio->count].mirror_num = mirror_num;
+
+	page = alloc_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
+	if (!ret) {
+		__free_page(page);
+		ret = scrub_submit(sdev);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
 	if (csum) {
 		sbio->spag[sbio->count].have_csum = 1;
 		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
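
Taken together, the scrub_submit() and scrub_page() hunks above invert the old allocation scheme: the bio and its pages used to be allocated all at once at submit time, whereas now the bio is allocated when the first page of a batch arrives and grows by one page per call, with an early submit whenever the batch would become full or physically/logically discontiguous. A toy model of that batching logic, stripped of bios and error handling; all names here are illustrative, not from the kernel:

#define BATCH_MAX_UNITS 16

struct batch {
	unsigned long long phys_start;	/* address of the first unit */
	int count;			/* units accumulated so far */
};

static void batch_flush(struct batch *b)
{
	/* the real code would submit_bio() here */
	b->count = 0;
}

static void batch_add(struct batch *b, unsigned long long physical)
{
	/* flush when full or when the new unit does not extend the run */
	if (b->count == BATCH_MAX_UNITS ||
	    (b->count && b->phys_start + b->count != physical))
		batch_flush(b);
	if (b->count == 0)
		b->phys_start = physical;	/* open a new batch */
	b->count++;
}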
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
 /* scrub extent tries to collect up to 64 kB for each bio */
 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-			u64 physical, u64 flags, u64 gen, u64 mirror_num)
+			u64 physical, u64 flags, u64 gen, int mirror_num)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	int slot;
 	int i;
 	u64 nstripes;
-	int start_stripe;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
 	u64 generation;
-	u64 mirror_num;
+	int mirror_num;
+	struct reada_control *reada1;
+	struct reada_control *reada2;
+	struct btrfs_key key_start;
+	struct btrfs_key key_end;
 
 	u64 increment = map->stripe_len;
 	u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
-		mirror_num = 0;
+		mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
-		mirror_num = num % map->sub_stripes;
+		mirror_num = num % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		increment = map->stripe_len;
-		mirror_num = num % map->num_stripes;
+		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		increment = map->stripe_len;
-		mirror_num = num % map->num_stripes;
+		mirror_num = num % map->num_stripes + 1;
 	} else {
 		increment = map->stripe_len;
-		mirror_num = 0;
+		mirror_num = 1;
 	}
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
 	/*
-	 * find all extents for each stripe and just read them to get
-	 * them into the page cache
-	 * FIXME: we can do better. build a more intelligent prefetching
+	 * trigger the readahead for extent tree csum tree and wait for
+	 * completion. During readahead, the scrub is officially paused
+	 * to not hold off transaction commits
 	 */
 	logical = base + offset;
-	physical = map->stripes[num].physical;
-	ret = 0;
-	for (i = 0; i < nstripes; ++i) {
-		key.objectid = logical;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = (u64)0;
-
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out_noplug;
-
-		/*
-		 * we might miss half an extent here, but that doesn't matter,
-		 * as it's only the prefetch
-		 */
-		while (1) {
-			l = path->nodes[0];
-			slot = path->slots[0];
-			if (slot >= btrfs_header_nritems(l)) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret == 0)
-					continue;
-				if (ret < 0)
-					goto out_noplug;
 
-				break;
-			}
-			btrfs_item_key_to_cpu(l, &key, slot);
+	wait_event(sdev->list_wait,
+		   atomic_read(&sdev->in_flight) == 0);
+	atomic_inc(&fs_info->scrubs_paused);
+	wake_up(&fs_info->scrub_pause_wait);
 
-			if (key.objectid >= logical + map->stripe_len)
-				break;
+	/* FIXME it might be better to start readahead at commit root */
+	key_start.objectid = logical;
+	key_start.type = BTRFS_EXTENT_ITEM_KEY;
+	key_start.offset = (u64)0;
+	key_end.objectid = base + offset + nstripes * increment;
+	key_end.type = BTRFS_EXTENT_ITEM_KEY;
+	key_end.offset = (u64)0;
+	reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_start.type = BTRFS_EXTENT_CSUM_KEY;
+	key_start.offset = logical;
+	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_end.type = BTRFS_EXTENT_CSUM_KEY;
+	key_end.offset = base + offset + nstripes * increment;
+	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+	if (!IS_ERR(reada1))
+		btrfs_reada_wait(reada1);
+	if (!IS_ERR(reada2))
+		btrfs_reada_wait(reada2);
 
-			path->slots[0]++;
-		}
-		btrfs_release_path(path);
-		logical += increment;
-		physical += map->stripe_len;
-		cond_resched();
-	}
+	mutex_lock(&fs_info->scrub_lock);
+	while (atomic_read(&fs_info->scrub_pause_req)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrub_pause_req) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	wake_up(&fs_info->scrub_pause_wait);
 
 	/*
 	 * collect all data csums for the stripe to avoid seeking during
 	 * the scrub. This might currently (crc32) end up to be about 1MB
 	 */
-	start_stripe = 0;
 	blk_start_plug(&plug);
-again:
-	logical = base + offset + start_stripe * increment;
-	for (i = start_stripe; i < nstripes; ++i) {
-		ret = btrfs_lookup_csums_range(csum_root, logical,
-					       logical + map->stripe_len - 1,
-					       &sdev->csum_list, 1);
-		if (ret)
-			goto out;
 
-		logical += increment;
-		cond_resched();
-	}
 	/*
 	 * now find all extents for each stripe and scrub them
 	 */
-	logical = base + offset + start_stripe * increment;
-	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+	logical = base + offset;
+	physical = map->stripes[num].physical;
 	ret = 0;
-	for (i = start_stripe; i < nstripes; ++i) {
+	for (i = 0; i < nstripes; ++i) {
 		/*
 		 * canceled?
 		 */
@@ -882,11 +1257,14 @@ again:
 			atomic_dec(&fs_info->scrubs_paused);
 			mutex_unlock(&fs_info->scrub_lock);
 			wake_up(&fs_info->scrub_pause_wait);
-			scrub_free_csums(sdev);
-			start_stripe = i;
-			goto again;
 		}
 
+		ret = btrfs_lookup_csums_range(csum_root, logical,
+					       logical + map->stripe_len - 1,
+					       &sdev->csum_list, 1);
+		if (ret)
+			goto out;
+
 		key.objectid = logical;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
 
 out:
 	blk_finish_plug(&plug);
-out_noplug:
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
@@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
 	ret = scrub_enumerate_chunks(sdev, start, end);
 
 	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
-
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 
+	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+
 	if (progress)
 		memcpy(progress, &sdev->stat, sizeof(*progress));
 
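
The new wait_event() at the end pairs with the fixup_cnt accounting introduced earlier in the patch: the counter is incremented before a fixup worker is queued and decremented (followed by a wake_up) when the worker finishes, so btrfs_scrub_dev() cannot copy out the final statistics while fixup work is still in flight. The generic shape of that drain pattern, as a minimal sketch with illustrative names:

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t pending_work = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(pending_wait);

static void queue_async_work(void)
{
	atomic_inc(&pending_work);	/* account before the work can run */
	/* ... hand the work item to a worker thread here ... */
}

static void async_work_fn(void)
{
	/* ... do the actual work ... */
	atomic_dec(&pending_work);
	wake_up(&pending_wait);		/* wake only after the decrement */
}

static void drain_async_work(void)
{
	wait_event(pending_wait, atomic_read(&pending_work) == 0);
}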