diff options
author | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2011-06-13 14:04:15 -0400 |
---|---|---|
committer | Jan Schmidt <list.btrfs@jan-o-sch.net> | 2011-09-29 06:54:28 -0400 |
commit | 0ef8e45158f97dde4801b535e25f70f7caf01a27 (patch) | |
tree | d1c29055e475402613d529738225df79d8789d20 /fs/btrfs/scrub.c | |
parent | e12fa9cd390f8e93a9144bd99bd6f6ed316fbc1e (diff) |
btrfs scrub: add fixup code for errors on nodatasum files
This removes a FIXME comment and introduces the first part of nodatasum
fixup: It gets the corresponding inode for a logical address and triggers a
regular readpage for the corrupted sector.
Once we have on-the-fly error correction, our error will be automatically
corrected. The correction code is expected to clear the newly introduced
EXTENT_DAMAGED flag, so that scrub eventually reports that error as
"corrected" instead of "uncorrectable".
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 188 |
1 files changed, 182 insertions, 6 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 41a01147b959..db09f01c0e4f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include "volumes.h" | 22 | #include "volumes.h" |
23 | #include "disk-io.h" | 23 | #include "disk-io.h" |
24 | #include "ordered-data.h" | 24 | #include "ordered-data.h" |
25 | #include "transaction.h" | ||
25 | #include "backref.h" | 26 | #include "backref.h" |
26 | 27 | ||
27 | /* | 28 | /* |
@@ -89,6 +90,7 @@ struct scrub_dev { | |||
89 | int first_free; | 90 | int first_free; |
90 | int curr; | 91 | int curr; |
91 | atomic_t in_flight; | 92 | atomic_t in_flight; |
93 | atomic_t fixup_cnt; | ||
92 | spinlock_t list_lock; | 94 | spinlock_t list_lock; |
93 | wait_queue_head_t list_wait; | 95 | wait_queue_head_t list_wait; |
94 | u16 csum_size; | 96 | u16 csum_size; |
@@ -102,6 +104,14 @@ struct scrub_dev { | |||
102 | spinlock_t stat_lock; | 104 | spinlock_t stat_lock; |
103 | }; | 105 | }; |
104 | 106 | ||
107 | struct scrub_fixup_nodatasum { | ||
108 | struct scrub_dev *sdev; | ||
109 | u64 logical; | ||
110 | struct btrfs_root *root; | ||
111 | struct btrfs_work work; | ||
112 | int mirror_num; | ||
113 | }; | ||
114 | |||
105 | struct scrub_warning { | 115 | struct scrub_warning { |
106 | struct btrfs_path *path; | 116 | struct btrfs_path *path; |
107 | u64 extent_item_size; | 117 | u64 extent_item_size; |
@@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) | |||
190 | 200 | ||
191 | if (i != SCRUB_BIOS_PER_DEV-1) | 201 | if (i != SCRUB_BIOS_PER_DEV-1) |
192 | sdev->bios[i]->next_free = i + 1; | 202 | sdev->bios[i]->next_free = i + 1; |
193 | else | 203 | else |
194 | sdev->bios[i]->next_free = -1; | 204 | sdev->bios[i]->next_free = -1; |
195 | } | 205 | } |
196 | sdev->first_free = 0; | 206 | sdev->first_free = 0; |
197 | sdev->curr = -1; | 207 | sdev->curr = -1; |
198 | atomic_set(&sdev->in_flight, 0); | 208 | atomic_set(&sdev->in_flight, 0); |
209 | atomic_set(&sdev->fixup_cnt, 0); | ||
199 | atomic_set(&sdev->cancel_req, 0); | 210 | atomic_set(&sdev->cancel_req, 0); |
200 | sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); | 211 | sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); |
201 | INIT_LIST_HEAD(&sdev->csum_list); | 212 | INIT_LIST_HEAD(&sdev->csum_list); |
@@ -347,6 +358,151 @@ out: | |||
347 | kfree(swarn.msg_buf); | 358 | kfree(swarn.msg_buf); |
348 | } | 359 | } |
349 | 360 | ||
361 | static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | ||
362 | { | ||
363 | struct page *page; | ||
364 | unsigned long index; | ||
365 | struct scrub_fixup_nodatasum *fixup = ctx; | ||
366 | int ret; | ||
367 | int corrected; | ||
368 | struct btrfs_key key; | ||
369 | struct inode *inode; | ||
370 | u64 end = offset + PAGE_SIZE - 1; | ||
371 | struct btrfs_root *local_root; | ||
372 | |||
373 | key.objectid = root; | ||
374 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
375 | key.offset = (u64)-1; | ||
376 | local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); | ||
377 | if (IS_ERR(local_root)) | ||
378 | return PTR_ERR(local_root); | ||
379 | |||
380 | key.type = BTRFS_INODE_ITEM_KEY; | ||
381 | key.objectid = inum; | ||
382 | key.offset = 0; | ||
383 | inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); | ||
384 | if (IS_ERR(inode)) | ||
385 | return PTR_ERR(inode); | ||
386 | |||
387 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, | ||
388 | EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS); | ||
389 | |||
390 | /* set_extent_bit should either succeed or give proper error */ | ||
391 | WARN_ON(ret > 0); | ||
392 | if (ret) | ||
393 | return ret < 0 ? ret : -EFAULT; | ||
394 | |||
395 | index = offset >> PAGE_CACHE_SHIFT; | ||
396 | |||
397 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | ||
398 | if (!page) | ||
399 | return -ENOMEM; | ||
400 | |||
401 | ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, | ||
402 | btrfs_get_extent, fixup->mirror_num); | ||
403 | wait_on_page_locked(page); | ||
404 | corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end, | ||
405 | EXTENT_DAMAGED, 0, NULL); | ||
406 | |||
407 | if (corrected) | ||
408 | WARN_ON(!PageUptodate(page)); | ||
409 | else | ||
410 | clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end, | ||
411 | EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS); | ||
412 | |||
413 | put_page(page); | ||
414 | iput(inode); | ||
415 | |||
416 | if (ret < 0) | ||
417 | return ret; | ||
418 | |||
419 | if (ret == 0 && corrected) { | ||
420 | /* | ||
421 | * we only need to call readpage for one of the inodes belonging | ||
422 | * to this extent. so make iterate_extent_inodes stop | ||
423 | */ | ||
424 | return 1; | ||
425 | } | ||
426 | |||
427 | return -EIO; | ||
428 | } | ||
429 | |||
430 | static void scrub_fixup_nodatasum(struct btrfs_work *work) | ||
431 | { | ||
432 | int ret; | ||
433 | struct scrub_fixup_nodatasum *fixup; | ||
434 | struct scrub_dev *sdev; | ||
435 | struct btrfs_trans_handle *trans = NULL; | ||
436 | struct btrfs_fs_info *fs_info; | ||
437 | struct btrfs_path *path; | ||
438 | int uncorrectable = 0; | ||
439 | |||
440 | fixup = container_of(work, struct scrub_fixup_nodatasum, work); | ||
441 | sdev = fixup->sdev; | ||
442 | fs_info = fixup->root->fs_info; | ||
443 | |||
444 | path = btrfs_alloc_path(); | ||
445 | if (!path) { | ||
446 | spin_lock(&sdev->stat_lock); | ||
447 | ++sdev->stat.malloc_errors; | ||
448 | spin_unlock(&sdev->stat_lock); | ||
449 | uncorrectable = 1; | ||
450 | goto out; | ||
451 | } | ||
452 | |||
453 | trans = btrfs_join_transaction(fixup->root); | ||
454 | if (IS_ERR(trans)) { | ||
455 | uncorrectable = 1; | ||
456 | goto out; | ||
457 | } | ||
458 | |||
459 | /* | ||
460 | * the idea is to trigger a regular read through the standard path. we | ||
461 | * read a page from the (failed) logical address by specifying the | ||
462 | * corresponding copynum of the failed sector. thus, that readpage is | ||
463 | * expected to fail. | ||
464 | * that is the point where on-the-fly error correction will kick in | ||
465 | * (once it's finished) and rewrite the failed sector if a good copy | ||
466 | * can be found. | ||
467 | */ | ||
468 | ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, | ||
469 | path, scrub_fixup_readpage, | ||
470 | fixup); | ||
471 | if (ret < 0) { | ||
472 | uncorrectable = 1; | ||
473 | goto out; | ||
474 | } | ||
475 | WARN_ON(ret != 1); | ||
476 | |||
477 | spin_lock(&sdev->stat_lock); | ||
478 | ++sdev->stat.corrected_errors; | ||
479 | spin_unlock(&sdev->stat_lock); | ||
480 | |||
481 | out: | ||
482 | if (trans && !IS_ERR(trans)) | ||
483 | btrfs_end_transaction(trans, fixup->root); | ||
484 | if (uncorrectable) { | ||
485 | spin_lock(&sdev->stat_lock); | ||
486 | ++sdev->stat.uncorrectable_errors; | ||
487 | spin_unlock(&sdev->stat_lock); | ||
488 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup " | ||
489 | "(nodatasum) error at logical %llu\n", | ||
490 | fixup->logical); | ||
491 | } | ||
492 | |||
493 | btrfs_free_path(path); | ||
494 | kfree(fixup); | ||
495 | |||
496 | /* see caller why we're pretending to be paused in the scrub counters */ | ||
497 | mutex_lock(&fs_info->scrub_lock); | ||
498 | atomic_dec(&fs_info->scrubs_running); | ||
499 | atomic_dec(&fs_info->scrubs_paused); | ||
500 | mutex_unlock(&fs_info->scrub_lock); | ||
501 | atomic_dec(&sdev->fixup_cnt); | ||
502 | wake_up(&fs_info->scrub_pause_wait); | ||
503 | wake_up(&sdev->list_wait); | ||
504 | } | ||
505 | |||
350 | /* | 506 | /* |
351 | * scrub_recheck_error gets called when either verification of the page | 507 | * scrub_recheck_error gets called when either verification of the page |
352 | * failed or the bio failed to read, e.g. with EIO. In the latter case, | 508 | * failed or the bio failed to read, e.g. with EIO. In the latter case, |
@@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
417 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 573 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; |
418 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 574 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
419 | struct btrfs_multi_bio *multi = NULL; | 575 | struct btrfs_multi_bio *multi = NULL; |
576 | struct scrub_fixup_nodatasum *fixup; | ||
420 | u64 logical = sbio->logical + ix * PAGE_SIZE; | 577 | u64 logical = sbio->logical + ix * PAGE_SIZE; |
421 | u64 length; | 578 | u64 length; |
422 | int i; | 579 | int i; |
@@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
425 | 582 | ||
426 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && | 583 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && |
427 | (sbio->spag[ix].have_csum == 0)) { | 584 | (sbio->spag[ix].have_csum == 0)) { |
585 | fixup = kzalloc(sizeof(*fixup), GFP_NOFS); | ||
586 | if (!fixup) | ||
587 | goto uncorrectable; | ||
588 | fixup->sdev = sdev; | ||
589 | fixup->logical = logical; | ||
590 | fixup->root = fs_info->extent_root; | ||
591 | fixup->mirror_num = sbio->spag[ix].mirror_num; | ||
428 | /* | 592 | /* |
429 | * nodatasum, don't try to fix anything | 593 | * increment scrubs_running to prevent cancel requests from |
430 | * FIXME: we can do better, open the inode and trigger a | 594 | * completing as long as a fixup worker is running. we must also |
431 | * writeback | 595 | * increment scrubs_paused to prevent deadlocking on pause |
596 | * requests used for transactions commits (as the worker uses a | ||
597 | * transaction context). it is safe to regard the fixup worker | ||
598 | * as paused for all matters practical. effectively, we only | ||
599 | * avoid cancellation requests from completing. | ||
432 | */ | 600 | */ |
433 | goto uncorrectable; | 601 | mutex_lock(&fs_info->scrub_lock); |
602 | atomic_inc(&fs_info->scrubs_running); | ||
603 | atomic_inc(&fs_info->scrubs_paused); | ||
604 | mutex_unlock(&fs_info->scrub_lock); | ||
605 | atomic_inc(&sdev->fixup_cnt); | ||
606 | fixup->work.func = scrub_fixup_nodatasum; | ||
607 | btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); | ||
608 | return; | ||
434 | } | 609 | } |
435 | 610 | ||
436 | length = PAGE_SIZE; | 611 | length = PAGE_SIZE; |
@@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | |||
1425 | ret = scrub_enumerate_chunks(sdev, start, end); | 1600 | ret = scrub_enumerate_chunks(sdev, start, end); |
1426 | 1601 | ||
1427 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); | 1602 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); |
1428 | |||
1429 | atomic_dec(&fs_info->scrubs_running); | 1603 | atomic_dec(&fs_info->scrubs_running); |
1430 | wake_up(&fs_info->scrub_pause_wait); | 1604 | wake_up(&fs_info->scrub_pause_wait); |
1431 | 1605 | ||
1606 | wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); | ||
1607 | |||
1432 | if (progress) | 1608 | if (progress) |
1433 | memcpy(progress, &sdev->stat, sizeof(*progress)); | 1609 | memcpy(progress, &sdev->stat, sizeof(*progress)); |
1434 | 1610 | ||