diff options
author | Arne Jansen <sensille@gmx.net> | 2011-06-10 06:39:23 -0400 |
---|---|---|
committer | Arne Jansen <sensille@gmx.net> | 2011-10-02 02:48:45 -0400 |
commit | 7a26285eea8eb92e0088db011571d887d4551b0f (patch) | |
tree | 2165dcb75f2ebbae358fc54aebaa49e7337c317f /fs | |
parent | 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 (diff) |
btrfs: use readahead API for scrub
Scrub uses a simple tree-enumeration to bring the relevant portions
of the extent- and csum-tree into the page cache before starting the
scrub-I/O. This is now replaced by using the new readahead API.
During readahead the scrub is being accounted as paused, so it won't
hold off transaction commits.
This change raises the average disk bandwidth utilisation on my test
volume from 70% to 90%. On another volume, the time for a test run
went down from 89s to 43s.
Changes v5:
- reada1/2 are now of type struct reada_control *
Signed-off-by: Arne Jansen <sensille@gmx.net>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/scrub.c | 112 |
1 files changed, 50 insertions, 62 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..f930f2776589 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -29,15 +29,12 @@ | |||
29 | * any can be found. | 29 | * any can be found. |
30 | * | 30 | * |
31 | * Future enhancements: | 31 | * Future enhancements: |
32 | * - To enhance the performance, better read-ahead strategies for the | ||
33 | * extent-tree can be employed. | ||
34 | * - In case an unrepairable extent is encountered, track which files are | 32 | * - In case an unrepairable extent is encountered, track which files are |
35 | * affected and report them | 33 | * affected and report them |
36 | * - In case of a read error on files with nodatasum, map the file and read | 34 | * - In case of a read error on files with nodatasum, map the file and read |
37 | * the extent to trigger a writeback of the good copy | 35 | * the extent to trigger a writeback of the good copy |
38 | * - track and record media errors, throw out bad devices | 36 | * - track and record media errors, throw out bad devices |
39 | * - add a mode to also read unallocated space | 37 | * - add a mode to also read unallocated space |
40 | * - make the prefetch cancellable | ||
41 | */ | 38 | */ |
42 | 39 | ||
43 | struct scrub_bio; | 40 | struct scrub_bio; |
@@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
741 | int slot; | 738 | int slot; |
742 | int i; | 739 | int i; |
743 | u64 nstripes; | 740 | u64 nstripes; |
744 | int start_stripe; | ||
745 | struct extent_buffer *l; | 741 | struct extent_buffer *l; |
746 | struct btrfs_key key; | 742 | struct btrfs_key key; |
747 | u64 physical; | 743 | u64 physical; |
748 | u64 logical; | 744 | u64 logical; |
749 | u64 generation; | 745 | u64 generation; |
750 | u64 mirror_num; | 746 | u64 mirror_num; |
747 | struct reada_control *reada1; | ||
748 | struct reada_control *reada2; | ||
749 | struct btrfs_key key_start; | ||
750 | struct btrfs_key key_end; | ||
751 | 751 | ||
752 | u64 increment = map->stripe_len; | 752 | u64 increment = map->stripe_len; |
753 | u64 offset; | 753 | u64 offset; |
@@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
779 | if (!path) | 779 | if (!path) |
780 | return -ENOMEM; | 780 | return -ENOMEM; |
781 | 781 | ||
782 | path->reada = 2; | ||
783 | path->search_commit_root = 1; | 782 | path->search_commit_root = 1; |
784 | path->skip_locking = 1; | 783 | path->skip_locking = 1; |
785 | 784 | ||
786 | /* | 785 | /* |
787 | * find all extents for each stripe and just read them to get | 786 | * trigger the readahead for extent tree csum tree and wait for |
788 | * them into the page cache | 787 | * completion. During readahead, the scrub is officially paused |
789 | * FIXME: we can do better. build a more intelligent prefetching | 788 | * to not hold off transaction commits |
790 | */ | 789 | */ |
791 | logical = base + offset; | 790 | logical = base + offset; |
792 | physical = map->stripes[num].physical; | ||
793 | ret = 0; | ||
794 | for (i = 0; i < nstripes; ++i) { | ||
795 | key.objectid = logical; | ||
796 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
797 | key.offset = (u64)0; | ||
798 | |||
799 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
800 | if (ret < 0) | ||
801 | goto out_noplug; | ||
802 | |||
803 | /* | ||
804 | * we might miss half an extent here, but that doesn't matter, | ||
805 | * as it's only the prefetch | ||
806 | */ | ||
807 | while (1) { | ||
808 | l = path->nodes[0]; | ||
809 | slot = path->slots[0]; | ||
810 | if (slot >= btrfs_header_nritems(l)) { | ||
811 | ret = btrfs_next_leaf(root, path); | ||
812 | if (ret == 0) | ||
813 | continue; | ||
814 | if (ret < 0) | ||
815 | goto out_noplug; | ||
816 | 791 | ||
817 | break; | 792 | wait_event(sdev->list_wait, |
818 | } | 793 | atomic_read(&sdev->in_flight) == 0); |
819 | btrfs_item_key_to_cpu(l, &key, slot); | 794 | atomic_inc(&fs_info->scrubs_paused); |
795 | wake_up(&fs_info->scrub_pause_wait); | ||
820 | 796 | ||
821 | if (key.objectid >= logical + map->stripe_len) | 797 | /* FIXME it might be better to start readahead at commit root */ |
822 | break; | 798 | key_start.objectid = logical; |
799 | key_start.type = BTRFS_EXTENT_ITEM_KEY; | ||
800 | key_start.offset = (u64)0; | ||
801 | key_end.objectid = base + offset + nstripes * increment; | ||
802 | key_end.type = BTRFS_EXTENT_ITEM_KEY; | ||
803 | key_end.offset = (u64)0; | ||
804 | reada1 = btrfs_reada_add(root, &key_start, &key_end); | ||
805 | |||
806 | key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
807 | key_start.type = BTRFS_EXTENT_CSUM_KEY; | ||
808 | key_start.offset = logical; | ||
809 | key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
810 | key_end.type = BTRFS_EXTENT_CSUM_KEY; | ||
811 | key_end.offset = base + offset + nstripes * increment; | ||
812 | reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); | ||
813 | |||
814 | if (!IS_ERR(reada1)) | ||
815 | btrfs_reada_wait(reada1); | ||
816 | if (!IS_ERR(reada2)) | ||
817 | btrfs_reada_wait(reada2); | ||
823 | 818 | ||
824 | path->slots[0]++; | 819 | mutex_lock(&fs_info->scrub_lock); |
825 | } | 820 | while (atomic_read(&fs_info->scrub_pause_req)) { |
826 | btrfs_release_path(path); | 821 | mutex_unlock(&fs_info->scrub_lock); |
827 | logical += increment; | 822 | wait_event(fs_info->scrub_pause_wait, |
828 | physical += map->stripe_len; | 823 | atomic_read(&fs_info->scrub_pause_req) == 0); |
829 | cond_resched(); | 824 | mutex_lock(&fs_info->scrub_lock); |
830 | } | 825 | } |
826 | atomic_dec(&fs_info->scrubs_paused); | ||
827 | mutex_unlock(&fs_info->scrub_lock); | ||
828 | wake_up(&fs_info->scrub_pause_wait); | ||
831 | 829 | ||
832 | /* | 830 | /* |
833 | * collect all data csums for the stripe to avoid seeking during | 831 | * collect all data csums for the stripe to avoid seeking during |
834 | * the scrub. This might currently (crc32) end up to be about 1MB | 832 | * the scrub. This might currently (crc32) end up to be about 1MB |
835 | */ | 833 | */ |
836 | start_stripe = 0; | ||
837 | blk_start_plug(&plug); | 834 | blk_start_plug(&plug); |
838 | again: | ||
839 | logical = base + offset + start_stripe * increment; | ||
840 | for (i = start_stripe; i < nstripes; ++i) { | ||
841 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
842 | logical + map->stripe_len - 1, | ||
843 | &sdev->csum_list, 1); | ||
844 | if (ret) | ||
845 | goto out; | ||
846 | 835 | ||
847 | logical += increment; | ||
848 | cond_resched(); | ||
849 | } | ||
850 | /* | 836 | /* |
851 | * now find all extents for each stripe and scrub them | 837 | * now find all extents for each stripe and scrub them |
852 | */ | 838 | */ |
853 | logical = base + offset + start_stripe * increment; | 839 | logical = base + offset; |
854 | physical = map->stripes[num].physical + start_stripe * map->stripe_len; | 840 | physical = map->stripes[num].physical; |
855 | ret = 0; | 841 | ret = 0; |
856 | for (i = start_stripe; i < nstripes; ++i) { | 842 | for (i = 0; i < nstripes; ++i) { |
857 | /* | 843 | /* |
858 | * canceled? | 844 | * canceled? |
859 | */ | 845 | */ |
@@ -882,11 +868,14 @@ again: | |||
882 | atomic_dec(&fs_info->scrubs_paused); | 868 | atomic_dec(&fs_info->scrubs_paused); |
883 | mutex_unlock(&fs_info->scrub_lock); | 869 | mutex_unlock(&fs_info->scrub_lock); |
884 | wake_up(&fs_info->scrub_pause_wait); | 870 | wake_up(&fs_info->scrub_pause_wait); |
885 | scrub_free_csums(sdev); | ||
886 | start_stripe = i; | ||
887 | goto again; | ||
888 | } | 871 | } |
889 | 872 | ||
873 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
874 | logical + map->stripe_len - 1, | ||
875 | &sdev->csum_list, 1); | ||
876 | if (ret) | ||
877 | goto out; | ||
878 | |||
890 | key.objectid = logical; | 879 | key.objectid = logical; |
891 | key.type = BTRFS_EXTENT_ITEM_KEY; | 880 | key.type = BTRFS_EXTENT_ITEM_KEY; |
892 | key.offset = (u64)0; | 881 | key.offset = (u64)0; |
@@ -982,7 +971,6 @@ next: | |||
982 | 971 | ||
983 | out: | 972 | out: |
984 | blk_finish_plug(&plug); | 973 | blk_finish_plug(&plug); |
985 | out_noplug: | ||
986 | btrfs_free_path(path); | 974 | btrfs_free_path(path); |
987 | return ret < 0 ? ret : 0; | 975 | return ret < 0 ? ret : 0; |
988 | } | 976 | } |