author    Arne Jansen <sensille@gmx.net>  2011-06-10 06:39:23 -0400
committer Arne Jansen <sensille@gmx.net>  2011-10-02 02:48:45 -0400
commit    7a26285eea8eb92e0088db011571d887d4551b0f (patch)
tree      2165dcb75f2ebbae358fc54aebaa49e7337c317f /fs/btrfs/scrub.c
parent    4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 (diff)
btrfs: use readahead API for scrub
Scrub used a simple tree enumeration to bring the relevant portions of the
extent and csum trees into the page cache before starting the scrub I/O.
This is now replaced by the new readahead API. During readahead the scrub
is accounted as paused, so it won't hold off transaction commits.

This change raises the average disk bandwidth utilisation on my test
volume from 70% to 90%. On another volume, the time for a test run went
down from 89s to 43s.

Changes v5:
 - reada1/2 are now of type struct reada_control *

Signed-off-by: Arne Jansen <sensille@gmx.net>
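The heart of the change is the prefetch pattern in the hunks below. As a
minimal sketch (the helper name and parameter list are illustrative; only
the btrfs_reada_add()/btrfs_reada_wait() calls and the
struct reada_control * return type are taken from this patch):

/*
 * Sketch of the prefetch introduced by this patch. Both readahead
 * requests are started before either is waited on, so the extent-tree
 * and csum-tree readahead proceed concurrently. Readahead is only an
 * optimization, so an ERR_PTR() from btrfs_reada_add() is skipped
 * rather than treated as a failure.
 */
static void scrub_prefetch_trees(struct btrfs_root *extent_root,
				 struct btrfs_root *csum_root,
				 struct btrfs_key *ext_start,
				 struct btrfs_key *ext_end,
				 struct btrfs_key *csum_start,
				 struct btrfs_key *csum_end)
{
	struct reada_control *reada1;
	struct reada_control *reada2;

	reada1 = btrfs_reada_add(extent_root, ext_start, ext_end);
	reada2 = btrfs_reada_add(csum_root, csum_start, csum_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);
}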
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--  fs/btrfs/scrub.c  112
1 file changed, 50 insertions(+), 62 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..f930f2776589 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -29,15 +29,12 @@
  * any can be found.
  *
  * Future enhancements:
- * - To enhance the performance, better read-ahead strategies for the
- *   extent-tree can be employed.
  * - In case an unrepairable extent is encountered, track which files are
  *   affected and report them
  * - In case of a read error on files with nodatasum, map the file and read
  *   the extent to trigger a writeback of the good copy
  * - track and record media errors, throw out bad devices
  * - add a mode to also read unallocated space
- * - make the prefetch cancellable
  */
 
 struct scrub_bio;
@@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	int slot;
 	int i;
 	u64 nstripes;
-	int start_stripe;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
 	u64 generation;
 	u64 mirror_num;
+	struct reada_control *reada1;
+	struct reada_control *reada2;
+	struct btrfs_key key_start;
+	struct btrfs_key key_end;
 
 	u64 increment = map->stripe_len;
 	u64 offset;
@@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
 	/*
-	 * find all extents for each stripe and just read them to get
-	 * them into the page cache
-	 * FIXME: we can do better. build a more intelligent prefetching
+	 * trigger the readahead for extent tree csum tree and wait for
+	 * completion. During readahead, the scrub is officially paused
+	 * to not hold off transaction commits
 	 */
 	logical = base + offset;
-	physical = map->stripes[num].physical;
-	ret = 0;
-	for (i = 0; i < nstripes; ++i) {
-		key.objectid = logical;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = (u64)0;
-
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out_noplug;
-
-		/*
-		 * we might miss half an extent here, but that doesn't matter,
-		 * as it's only the prefetch
-		 */
-		while (1) {
-			l = path->nodes[0];
-			slot = path->slots[0];
-			if (slot >= btrfs_header_nritems(l)) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret == 0)
-					continue;
-				if (ret < 0)
-					goto out_noplug;
 
-				break;
-			}
-			btrfs_item_key_to_cpu(l, &key, slot);
+	wait_event(sdev->list_wait,
+		   atomic_read(&sdev->in_flight) == 0);
+	atomic_inc(&fs_info->scrubs_paused);
+	wake_up(&fs_info->scrub_pause_wait);
 
-			if (key.objectid >= logical + map->stripe_len)
-				break;
+	/* FIXME it might be better to start readahead at commit root */
+	key_start.objectid = logical;
+	key_start.type = BTRFS_EXTENT_ITEM_KEY;
+	key_start.offset = (u64)0;
+	key_end.objectid = base + offset + nstripes * increment;
+	key_end.type = BTRFS_EXTENT_ITEM_KEY;
+	key_end.offset = (u64)0;
+	reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_start.type = BTRFS_EXTENT_CSUM_KEY;
+	key_start.offset = logical;
+	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_end.type = BTRFS_EXTENT_CSUM_KEY;
+	key_end.offset = base + offset + nstripes * increment;
+	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+	if (!IS_ERR(reada1))
+		btrfs_reada_wait(reada1);
+	if (!IS_ERR(reada2))
+		btrfs_reada_wait(reada2);
 
-			path->slots[0]++;
-		}
-		btrfs_release_path(path);
-		logical += increment;
-		physical += map->stripe_len;
-		cond_resched();
-	}
+	mutex_lock(&fs_info->scrub_lock);
+	while (atomic_read(&fs_info->scrub_pause_req)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrub_pause_req) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	wake_up(&fs_info->scrub_pause_wait);
 
 	/*
 	 * collect all data csums for the stripe to avoid seeking during
 	 * the scrub. This might currently (crc32) end up to be about 1MB
 	 */
-	start_stripe = 0;
 	blk_start_plug(&plug);
-again:
-	logical = base + offset + start_stripe * increment;
-	for (i = start_stripe; i < nstripes; ++i) {
-		ret = btrfs_lookup_csums_range(csum_root, logical,
-					       logical + map->stripe_len - 1,
-					       &sdev->csum_list, 1);
-		if (ret)
-			goto out;
 
-		logical += increment;
-		cond_resched();
-	}
 	/*
 	 * now find all extents for each stripe and scrub them
 	 */
-	logical = base + offset + start_stripe * increment;
-	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+	logical = base + offset;
+	physical = map->stripes[num].physical;
 	ret = 0;
-	for (i = start_stripe; i < nstripes; ++i) {
+	for (i = 0; i < nstripes; ++i) {
 		/*
 		 * canceled?
 		 */
@@ -882,11 +868,14 @@ again:
 			atomic_dec(&fs_info->scrubs_paused);
 			mutex_unlock(&fs_info->scrub_lock);
 			wake_up(&fs_info->scrub_pause_wait);
-			scrub_free_csums(sdev);
-			start_stripe = i;
-			goto again;
 		}
 
+		ret = btrfs_lookup_csums_range(csum_root, logical,
+					       logical + map->stripe_len - 1,
+					       &sdev->csum_list, 1);
+		if (ret)
+			goto out;
+
 		key.objectid = logical;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = (u64)0;
@@ -982,7 +971,6 @@ next:
 
 out:
 	blk_finish_plug(&plug);
-out_noplug:
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
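While the readahead runs, the scrub accounts itself as paused so that it
cannot hold off a transaction commit; before resuming it waits until any
pending pause request has been withdrawn. Wrapped into a hypothetical
helper for clarity (the function does not exist in the patch; its body
mirrors the statements added to scrub_stripe() in the +779 hunk above):

/*
 * Hypothetical helper: the pause bracket placed around the readahead,
 * shown in isolation.
 */
static void scrub_pause_around_readahead(struct scrub_dev *sdev,
					 struct btrfs_fs_info *fs_info)
{
	/* flush outstanding scrub I/O, then mark this scrub as paused */
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* ... btrfs_reada_add()/btrfs_reada_wait() run at this point ... */

	/* wait out any pause request before resuming regular scrub I/O */
	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);
}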