author    Chris Mason <clm@fb.com>  2014-12-02 21:42:03 -0500
committer Chris Mason <clm@fb.com>  2014-12-02 21:42:03 -0500
commit    9627aeee3e203e30679549e4962633698a6bf87f (patch)
tree      30ee313a7049bf3fcc17e346df5737e967fd9a95 /fs/btrfs
parent    cb83b7b81698a4abe531e0ba18b9e288b06947ce (diff)
parent    5d3edd8f44aac94de7b16f4c54290e24f5e8c532 (diff)
Merge branch 'raid56-scrub-replace' of git://github.com/miaoxie/linux-btrfs into for-linus
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/ctree.c         14
-rw-r--r--  fs/btrfs/ctree.h          7
-rw-r--r--  fs/btrfs/dev-replace.c    9
-rw-r--r--  fs/btrfs/locking.c       24
-rw-r--r--  fs/btrfs/locking.h        2
-rw-r--r--  fs/btrfs/raid56.c       763
-rw-r--r--  fs/btrfs/raid56.h        16
-rw-r--r--  fs/btrfs/scrub.c        803
-rw-r--r--  fs/btrfs/volumes.c       52
-rw-r--r--  fs/btrfs/volumes.h       14
10 files changed, 1556 insertions, 148 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 817234168a7f..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
80{ 80{
81 int i; 81 int i;
82 82
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 /* lockdep really cares that we take all of these spinlocks
85 * in the right order. If any of the locks in the path are not
86 * currently blocking, it is going to complain. So, make really
87 * really sure by forcing the path to blocking before we clear
88 * the path blocking.
89 */
90 if (held) { 83 if (held) {
91 btrfs_set_lock_blocking_rw(held, held_rw); 84 btrfs_set_lock_blocking_rw(held, held_rw);
92 if (held_rw == BTRFS_WRITE_LOCK) 85 if (held_rw == BTRFS_WRITE_LOCK)
@@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
95 held_rw = BTRFS_READ_LOCK_BLOCKING; 88 held_rw = BTRFS_READ_LOCK_BLOCKING;
96 } 89 }
97 btrfs_set_path_blocking(p); 90 btrfs_set_path_blocking(p);
98#endif
99 91
100 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 92 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
101 if (p->nodes[i] && p->locks[i]) { 93 if (p->nodes[i] && p->locks[i]) {
@@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
107 } 99 }
108 } 100 }
109 101
110#ifdef CONFIG_DEBUG_LOCK_ALLOC
111 if (held) 102 if (held)
112 btrfs_clear_lock_blocking_rw(held, held_rw); 103 btrfs_clear_lock_blocking_rw(held, held_rw);
113#endif
114} 104}
115 105
116/* this also releases the path */ 106/* this also releases the path */
@@ -2893,7 +2883,7 @@ cow_done:
2893 } 2883 }
2894 p->locks[level] = BTRFS_WRITE_LOCK; 2884 p->locks[level] = BTRFS_WRITE_LOCK;
2895 } else { 2885 } else {
2896 err = btrfs_try_tree_read_lock(b); 2886 err = btrfs_tree_read_lock_atomic(b);
2897 if (!err) { 2887 if (!err) {
2898 btrfs_set_path_blocking(p); 2888 btrfs_set_path_blocking(p);
2899 btrfs_tree_read_lock(b); 2889 btrfs_tree_read_lock(b);
@@ -3025,7 +3015,7 @@ again:
3025 } 3015 }
3026 3016
3027 level = btrfs_header_level(b); 3017 level = btrfs_header_level(b);
3028 err = btrfs_try_tree_read_lock(b); 3018 err = btrfs_tree_read_lock_atomic(b);
3029 if (!err) { 3019 if (!err) {
3030 btrfs_set_path_blocking(p); 3020 btrfs_set_path_blocking(p);
3031 btrfs_tree_read_lock(b); 3021 btrfs_tree_read_lock(b);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d71915e04e92..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4167,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4167/* dev-replace.c */ 4167/* dev-replace.c */
4168void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); 4168void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4169void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); 4169void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4170void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); 4170void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
4171
4172static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
4173{
4174 btrfs_bio_counter_sub(fs_info, 1);
4175}
4171 4176
4172/* reada.c */ 4177/* reada.c */
4173struct reada_control { 4178struct reada_control {
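The ctree.h change above turns btrfs_bio_counter_dec() into a static inline wrapper around the new btrfs_bio_counter_sub(), so a caller that accounts several bios at once can drop the whole count in a single call. Later in this merge the raid56 code uses exactly that: each rbio accumulates references in generic_bio_cnt and releases them in one step when it completes. A minimal sketch of that pairing (the helper names below are illustrative, not taken from the patch):

static void rbio_take_bio_ref(struct btrfs_raid_bio *rbio)
{
	/* taken while a generic bio is in flight; never blocks */
	btrfs_bio_counter_inc_noblocked(rbio->fs_info);
	rbio->generic_bio_cnt++;
}

static void rbio_drop_bio_refs(struct btrfs_raid_bio *rbio)
{
	/* drop everything this rbio accumulated in one call */
	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
}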
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3fbd0628620b..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
316 struct btrfs_device *tgt_device = NULL; 316 struct btrfs_device *tgt_device = NULL;
317 struct btrfs_device *src_device = NULL; 317 struct btrfs_device *src_device = NULL;
318 318
319 if (btrfs_fs_incompat(fs_info, RAID56)) {
320 btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
321 return -EOPNOTSUPP;
322 }
323
324 switch (args->start.cont_reading_from_srcdev_mode) { 319 switch (args->start.cont_reading_from_srcdev_mode) {
325 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 320 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
326 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 321 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -927,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
927 percpu_counter_inc(&fs_info->bio_counter); 922 percpu_counter_inc(&fs_info->bio_counter);
928} 923}
929 924
930void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) 925void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
931{ 926{
932 percpu_counter_dec(&fs_info->bio_counter); 927 percpu_counter_sub(&fs_info->bio_counter, amount);
933 928
934 if (waitqueue_active(&fs_info->replace_wait)) 929 if (waitqueue_active(&fs_info->replace_wait))
935 wake_up(&fs_info->replace_wait); 930 wake_up(&fs_info->replace_wait);
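Two things happen in dev-replace.c: the guard that refused dev_replace on RAID5/6 filesystems is removed (the point of this merge), and the decrement helper becomes btrfs_bio_counter_sub() so it can drop more than one reference per call. The wake_up() on fs_info->replace_wait is kept, so whatever waits for in-flight bios to drain still observes the counter falling. A hedged sketch of such a waiter (the helper name is illustrative; the real blocking logic lives elsewhere in the dev-replace/volumes code):

/* illustrative only: block until every counted bio has completed */
static void wait_for_bio_counter_to_drain(struct btrfs_fs_info *fs_info)
{
	wait_event(fs_info->replace_wait,
		   percpu_counter_sum(&fs_info->bio_counter) == 0);
}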
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5665d2149249..f8229ef1b46d 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -128,6 +128,26 @@ again:
128} 128}
129 129
130/* 130/*
131 * take a spinning read lock.
132 * returns 1 if we get the read lock and 0 if we don't
133 * this won't wait for blocking writers
134 */
135int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
136{
137 if (atomic_read(&eb->blocking_writers))
138 return 0;
139
140 read_lock(&eb->lock);
141 if (atomic_read(&eb->blocking_writers)) {
142 read_unlock(&eb->lock);
143 return 0;
144 }
145 atomic_inc(&eb->read_locks);
146 atomic_inc(&eb->spinning_readers);
147 return 1;
148}
149
150/*
131 * returns 1 if we get the read lock and 0 if we don't 151 * returns 1 if we get the read lock and 0 if we don't
132 * this won't wait for blocking writers 152 * this won't wait for blocking writers
133 */ 153 */
@@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
158 atomic_read(&eb->blocking_readers)) 178 atomic_read(&eb->blocking_readers))
159 return 0; 179 return 0;
160 180
161 if (!write_trylock(&eb->lock)) 181 write_lock(&eb->lock);
162 return 0;
163
164 if (atomic_read(&eb->blocking_writers) || 182 if (atomic_read(&eb->blocking_writers) ||
165 atomic_read(&eb->blocking_readers)) { 183 atomic_read(&eb->blocking_readers)) {
166 write_unlock(&eb->lock); 184 write_unlock(&eb->lock);
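locking.c gains btrfs_tree_read_lock_atomic(): unlike btrfs_try_tree_read_lock() it is willing to spin on the rwlock itself, but it still refuses to wait for blocking writers, so it never sleeps. btrfs_try_tree_write_lock() is relaxed the same way (write_lock() instead of write_trylock()). The ctree.c hunks earlier in this diff switch the tree-search path to the new helper with a blocking fallback; a condensed sketch of that pattern (not the literal btrfs_search_slot() code):

static int read_lock_eb_for_search(struct btrfs_path *p,
				   struct extent_buffer *b)
{
	if (btrfs_tree_read_lock_atomic(b))
		return BTRFS_READ_LOCK;		/* spinning lock, no sleep */

	/* a writer went blocking: drop the path to blocking and wait */
	btrfs_set_path_blocking(p);
	btrfs_tree_read_lock(b);
	return BTRFS_READ_LOCK_BLOCKING;
}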
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b81e0e9a4894..c44a9d5f5362 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_assert_tree_locked(struct extent_buffer *eb); 35void btrfs_assert_tree_locked(struct extent_buffer *eb);
36int btrfs_try_tree_read_lock(struct extent_buffer *eb); 36int btrfs_try_tree_read_lock(struct extent_buffer *eb);
37int btrfs_try_tree_write_lock(struct extent_buffer *eb); 37int btrfs_try_tree_write_lock(struct extent_buffer *eb);
38int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
39
38 40
39static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) 41static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
40{ 42{
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map is managed by the caller, so we shouldn't free
63 * them here. And besides that, all rbios with this flag should not
64 * be cached, because we need raid_map to check the rbios' stripe
65 * is the same or not, but it is very likely that the caller has
66 * free raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
61 69
62#define RBIO_CACHE_SIZE 1024 70#define RBIO_CACHE_SIZE 1024
63 71
72enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75 BTRFS_RBIO_PARITY_SCRUB = 2,
76};
77
64struct btrfs_raid_bio { 78struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info; 79 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio; 80 struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
117 /* number of data stripes (no p/q) */ 131 /* number of data stripes (no p/q) */
118 int nr_data; 132 int nr_data;
119 133
134 int real_stripes;
135
136 int stripe_npages;
120 /* 137 /*
121 * set if we're doing a parity rebuild 138 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled 139 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of 140 * differently from a parity rebuild as part of
124 * rmw 141 * rmw
125 */ 142 */
126 int read_rebuild; 143 enum btrfs_rbio_ops operation;
127 144
128 /* first bad stripe */ 145 /* first bad stripe */
129 int faila; 146 int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
131 /* second bad stripe (for raid6 use) */ 148 /* second bad stripe (for raid6 use) */
132 int failb; 149 int failb;
133 150
151 int scrubp;
134 /* 152 /*
135 * number of pages needed to represent the full 153 * number of pages needed to represent the full
136 * stripe 154 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
144 */ 162 */
145 int bio_list_bytes; 163 int bio_list_bytes;
146 164
165 int generic_bio_cnt;
166
147 atomic_t refs; 167 atomic_t refs;
148 168
169 atomic_t stripes_pending;
170
171 atomic_t error;
149 /* 172 /*
150 * these are two arrays of pointers. We allocate the 173 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their 174 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
162 * here for faster lookup 185 * here for faster lookup
163 */ 186 */
164 struct page **bio_pages; 187 struct page **bio_pages;
188
189 /*
190 * bitmap to record which horizontal stripe has data
191 */
192 unsigned long *dbitmap;
165}; 193};
166 194
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 195static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio); 204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 206
207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 int need_check);
209static void async_scrub_parity(struct btrfs_raid_bio *rbio);
210
179/* 211/*
180 * the stripe hash table is used for locking, and to collect 212 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe 213 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
324{ 356{
325 bio_list_merge(&dest->bio_list, &victim->bio_list); 357 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes; 358 dest->bio_list_bytes += victim->bio_list_bytes;
359 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list); 360 bio_list_init(&victim->bio_list);
328} 361}
329 362
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
577 cur->raid_map[0]) 610 cur->raid_map[0])
578 return 0; 611 return 0;
579 612
580 /* reads can't merge with writes */ 613 /* we can't merge with different operations */
581 if (last->read_rebuild != 614 if (last->operation != cur->operation)
582 cur->read_rebuild) { 615 return 0;
616 /*
617 * We've need read the full stripe from the drive.
618 * check and repair the parity and write the new results.
619 *
620 * We're not allowed to add any new bios to the
621 * bio list here, anyone else that wants to
622 * change this stripe needs to do their own rmw.
623 */
624 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
625 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
583 return 0; 626 return 0;
584 }
585 627
586 return 1; 628 return 1;
587} 629}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
601 */ 643 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 644static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{ 645{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 646 if (rbio->nr_data + 1 == rbio->real_stripes)
605 return NULL; 647 return NULL;
606 648
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 649 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
772 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags); 815 spin_unlock_irqrestore(&h->lock, flags);
774 816
775 if (next->read_rebuild) 817 if (next->operation == BTRFS_RBIO_READ_REBUILD)
776 async_read_rebuild(next); 818 async_read_rebuild(next);
777 else { 819 else if (next->operation == BTRFS_RBIO_WRITE) {
778 steal_rbio(rbio, next); 820 steal_rbio(rbio, next);
779 async_rmw_stripe(next); 821 async_rmw_stripe(next);
822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
823 steal_rbio(rbio, next);
824 async_scrub_parity(next);
780 } 825 }
781 826
782 goto done_nolock; 827 goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
796 remove_rbio_from_cache(rbio); 841 remove_rbio_from_cache(rbio);
797} 842}
798 843
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
799static void __free_raid_bio(struct btrfs_raid_bio *rbio) 859static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{ 860{
801 int i; 861 int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
814 rbio->stripe_pages[i] = NULL; 874 rbio->stripe_pages[i] = NULL;
815 } 875 }
816 } 876 }
817 kfree(rbio->raid_map); 877
818 kfree(rbio->bbio); 878 free_bbio_and_raid_map(rbio);
879
819 kfree(rbio); 880 kfree(rbio);
820} 881}
821 882
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{ 894{
834 struct bio *cur = bio_list_get(&rbio->bio_list); 895 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next; 896 struct bio *next;
897
898 if (rbio->generic_bio_cnt)
899 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
900
836 free_raid_bio(rbio); 901 free_raid_bio(rbio);
837 902
838 while (cur) { 903 while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
858 923
859 bio_put(bio); 924 bio_put(bio);
860 925
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 926 if (!atomic_dec_and_test(&rbio->stripes_pending))
862 return; 927 return;
863 928
864 err = 0; 929 err = 0;
865 930
866 /* OK, we have read all the stripes we need to. */ 931 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 932 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
868 err = -EIO; 933 err = -EIO;
869 934
870 rbio_orig_end_io(rbio, err, 0); 935 rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
925{ 990{
926 struct btrfs_raid_bio *rbio; 991 struct btrfs_raid_bio *rbio;
927 int nr_data = 0; 992 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 993 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
994 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
995 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
929 void *p; 996 void *p;
930 997
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 998 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
999 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
932 GFP_NOFS); 1000 GFP_NOFS);
933 if (!rbio) { 1001 if (!rbio)
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM); 1002 return ERR_PTR(-ENOMEM);
937 }
938 1003
939 bio_list_init(&rbio->bio_list); 1004 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list); 1005 INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
946 rbio->fs_info = root->fs_info; 1011 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len; 1012 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages; 1013 rbio->nr_pages = num_pages;
1014 rbio->real_stripes = real_stripes;
1015 rbio->stripe_npages = stripe_npages;
949 rbio->faila = -1; 1016 rbio->faila = -1;
950 rbio->failb = -1; 1017 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1); 1018 atomic_set(&rbio->refs, 1);
1019 atomic_set(&rbio->error, 0);
1020 atomic_set(&rbio->stripes_pending, 0);
952 1021
953 /* 1022 /*
954 * the stripe_pages and bio_pages array point to the extra 1023 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
957 p = rbio + 1; 1026 p = rbio + 1;
958 rbio->stripe_pages = p; 1027 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
960 1030
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2; 1032 nr_data = real_stripes - 2;
963 else 1033 else
964 nr_data = bbio->num_stripes - 1; 1034 nr_data = real_stripes - 1;
965 1035
966 rbio->nr_data = nr_data; 1036 rbio->nr_data = nr_data;
967 return rbio; 1037 return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1143static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{ 1144{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1145 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1146 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1077 __raid56_parity_recover(rbio); 1147 __raid56_parity_recover(rbio);
1078 } else { 1148 } else {
1079 finish_rmw(rbio); 1149 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1204static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{ 1205{
1136 struct btrfs_bio *bbio = rbio->bbio; 1206 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes]; 1207 void *pointers[rbio->real_stripes];
1138 int stripe_len = rbio->stripe_len; 1208 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data; 1209 int nr_data = rbio->nr_data;
1140 int stripe; 1210 int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1148 1218
1149 bio_list_init(&bio_list); 1219 bio_list_init(&bio_list);
1150 1220
1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1221 if (rbio->real_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1; 1222 p_stripe = rbio->real_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1223 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2; 1224 p_stripe = rbio->real_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1; 1225 q_stripe = rbio->real_stripes - 1;
1156 } else { 1226 } else {
1157 BUG(); 1227 BUG();
1158 } 1228 }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1239 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock); 1240 spin_unlock_irq(&rbio->bio_list_lock);
1171 1241
1172 atomic_set(&rbio->bbio->error, 0); 1242 atomic_set(&rbio->error, 0);
1173 1243
1174 /* 1244 /*
1175 * now that we've set rmw_locked, run through the 1245 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1209 SetPageUptodate(p); 1279 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p); 1280 pointers[stripe++] = kmap(p);
1211 1281
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1282 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1213 pointers); 1283 pointers);
1214 } else { 1284 } else {
1215 /* raid5 */ 1285 /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1218 } 1288 }
1219 1289
1220 1290
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1292 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 } 1293 }
1224 1294
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1297 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else. 1298 * everything else.
1229 */ 1299 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1300 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1301 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page; 1302 struct page *page;
1233 if (stripe < rbio->nr_data) { 1303 if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1245 } 1315 }
1246 } 1316 }
1247 1317
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1318 if (likely(!bbio->num_tgtdevs))
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322 if (!bbio->tgtdev_map[stripe])
1323 continue;
1324
1325 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
1335 ret = rbio_add_io_page(rbio, &bio_list, page,
1336 rbio->bbio->tgtdev_map[stripe],
1337 pagenr, rbio->stripe_len);
1338 if (ret)
1339 goto cleanup;
1340 }
1341 }
1342
1343write_data:
1344 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1250 1346
1251 while (1) { 1347 while (1) {
1252 bio = bio_list_pop(&bio_list); 1348 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1283 stripe = &rbio->bbio->stripes[i]; 1379 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical; 1380 stripe_start = stripe->physical;
1285 if (physical >= stripe_start && 1381 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) { 1382 physical < stripe_start + rbio->stripe_len &&
1383 bio->bi_bdev == stripe->dev->bdev) {
1287 return i; 1384 return i;
1288 } 1385 }
1289 } 1386 }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1331 if (rbio->faila == -1) { 1428 if (rbio->faila == -1) {
1332 /* first failure on this rbio */ 1429 /* first failure on this rbio */
1333 rbio->faila = failed; 1430 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error); 1431 atomic_inc(&rbio->error);
1335 } else if (rbio->failb == -1) { 1432 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */ 1433 /* second failure on this rbio */
1337 rbio->failb = failed; 1434 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error); 1435 atomic_inc(&rbio->error);
1339 } else { 1436 } else {
1340 ret = -EIO; 1437 ret = -EIO;
1341 } 1438 }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
1394 1491
1395 bio_put(bio); 1492 bio_put(bio);
1396 1493
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1494 if (!atomic_dec_and_test(&rbio->stripes_pending))
1398 return; 1495 return;
1399 1496
1400 err = 0; 1497 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1498 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1402 goto cleanup; 1499 goto cleanup;
1403 1500
1404 /* 1501 /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1536static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{ 1537{
1441 int bios_to_read = 0; 1538 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1539 struct bio_list bio_list;
1444 int ret; 1540 int ret;
1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1541 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1455 1551
1456 index_rbio_pages(rbio); 1552 index_rbio_pages(rbio);
1457 1553
1458 atomic_set(&rbio->bbio->error, 0); 1554 atomic_set(&rbio->error, 0);
1459 /* 1555 /*
1460 * build a list of bios to read all the missing parts of this 1556 * build a list of bios to read all the missing parts of this
1461 * stripe 1557 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1503 * the bbio may be freed once we submit the last bio. Make sure 1599 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that 1600 * not to touch it after that
1505 */ 1601 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1602 atomic_set(&rbio->stripes_pending, bios_to_read);
1507 while (1) { 1603 while (1) {
1508 bio = bio_list_pop(&bio_list); 1604 bio = bio_list_pop(&bio_list);
1509 if (!bio) 1605 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1686 struct btrfs_raid_bio *rbio; 1782 struct btrfs_raid_bio *rbio;
1687 struct btrfs_plug_cb *plug = NULL; 1783 struct btrfs_plug_cb *plug = NULL;
1688 struct blk_plug_cb *cb; 1784 struct blk_plug_cb *cb;
1785 int ret;
1689 1786
1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 if (IS_ERR(rbio)) 1788 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1);
1692 return PTR_ERR(rbio); 1790 return PTR_ERR(rbio);
1791 }
1693 bio_list_add(&rbio->bio_list, bio); 1792 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1793 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794 rbio->operation = BTRFS_RBIO_WRITE;
1795
1796 btrfs_bio_counter_inc_noblocked(root->fs_info);
1797 rbio->generic_bio_cnt = 1;
1695 1798
1696 /* 1799 /*
1697 * don't plug on full rbios, just get them out the door 1800 * don't plug on full rbios, just get them out the door
1698 * as quickly as we can 1801 * as quickly as we can
1699 */ 1802 */
1700 if (rbio_is_full(rbio)) 1803 if (rbio_is_full(rbio)) {
1701 return full_stripe_write(rbio); 1804 ret = full_stripe_write(rbio);
1805 if (ret)
1806 btrfs_bio_counter_dec(root->fs_info);
1807 return ret;
1808 }
1702 1809
1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1810 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 sizeof(*plug)); 1811 sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 INIT_LIST_HEAD(&plug->rbio_list); 1816 INIT_LIST_HEAD(&plug->rbio_list);
1710 } 1817 }
1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1818 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819 ret = 0;
1712 } else { 1820 } else {
1713 return __raid56_parity_write(rbio); 1821 ret = __raid56_parity_write(rbio);
1822 if (ret)
1823 btrfs_bio_counter_dec(root->fs_info);
1714 } 1824 }
1715 return 0; 1825 return ret;
1716} 1826}
1717 1827
1718/* 1828/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1730 int err; 1840 int err;
1731 int i; 1841 int i;
1732 1842
1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1843 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1734 GFP_NOFS); 1844 GFP_NOFS);
1735 if (!pointers) { 1845 if (!pointers) {
1736 err = -ENOMEM; 1846 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1740 faila = rbio->faila; 1850 faila = rbio->faila;
1741 failb = rbio->failb; 1851 failb = rbio->failb;
1742 1852
1743 if (rbio->read_rebuild) { 1853 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1744 spin_lock_irq(&rbio->bio_list_lock); 1854 spin_lock_irq(&rbio->bio_list_lock);
1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1855 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 spin_unlock_irq(&rbio->bio_list_lock); 1856 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1749 index_rbio_pages(rbio); 1859 index_rbio_pages(rbio);
1750 1860
1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1861 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862 /*
1863 * Now we just use bitmap to mark the horizontal stripes in
1864 * which we have data when doing parity scrub.
1865 */
1866 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867 !test_bit(pagenr, rbio->dbitmap))
1868 continue;
1869
1752 /* setup our array of pointers with pages 1870 /* setup our array of pointers with pages
1753 * from each stripe 1871 * from each stripe
1754 */ 1872 */
1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1873 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1756 /* 1874 /*
1757 * if we're rebuilding a read, we have to use 1875 * if we're rebuilding a read, we have to use
1758 * pages from the bio list 1876 * pages from the bio list
1759 */ 1877 */
1760 if (rbio->read_rebuild && 1878 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1761 (stripe == faila || stripe == failb)) { 1879 (stripe == faila || stripe == failb)) {
1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1880 page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 } else { 1881 } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1767 } 1885 }
1768 1886
1769 /* all raid6 handling here */ 1887 /* all raid6 handling here */
1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1888 if (rbio->raid_map[rbio->real_stripes - 1] ==
1771 RAID6_Q_STRIPE) { 1889 RAID6_Q_STRIPE) {
1772 1890
1773 /* 1891 /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1817 } 1935 }
1818 1936
1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 raid6_datap_recov(rbio->bbio->num_stripes, 1938 raid6_datap_recov(rbio->real_stripes,
1821 PAGE_SIZE, faila, pointers); 1939 PAGE_SIZE, faila, pointers);
1822 } else { 1940 } else {
1823 raid6_2data_recov(rbio->bbio->num_stripes, 1941 raid6_2data_recov(rbio->real_stripes,
1824 PAGE_SIZE, faila, failb, 1942 PAGE_SIZE, faila, failb,
1825 pointers); 1943 pointers);
1826 } 1944 }
@@ -1850,7 +1968,7 @@ pstripe:
1850 * know they can be trusted. If this was a read reconstruction, 1968 * know they can be trusted. If this was a read reconstruction,
1851 * other endio functions will fiddle the uptodate bits 1969 * other endio functions will fiddle the uptodate bits
1852 */ 1970 */
1853 if (!rbio->read_rebuild) { 1971 if (rbio->operation == BTRFS_RBIO_WRITE) {
1854 for (i = 0; i < nr_pages; i++) { 1972 for (i = 0; i < nr_pages; i++) {
1855 if (faila != -1) { 1973 if (faila != -1) {
1856 page = rbio_stripe_page(rbio, faila, i); 1974 page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
1862 } 1980 }
1863 } 1981 }
1864 } 1982 }
1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1983 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1866 /* 1984 /*
1867 * if we're rebuilding a read, we have to use 1985 * if we're rebuilding a read, we have to use
1868 * pages from the bio list 1986 * pages from the bio list
1869 */ 1987 */
1870 if (rbio->read_rebuild && 1988 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1871 (stripe == faila || stripe == failb)) { 1989 (stripe == faila || stripe == failb)) {
1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1990 page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 } else { 1991 } else {
@@ -1882,9 +2000,9 @@ cleanup:
1882 kfree(pointers); 2000 kfree(pointers);
1883 2001
1884cleanup_io: 2002cleanup_io:
1885 2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1886 if (rbio->read_rebuild) { 2004 if (err == 0 &&
1887 if (err == 0) 2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1888 cache_rbio_pages(rbio); 2006 cache_rbio_pages(rbio);
1889 else 2007 else
1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
1893 } else if (err == 0) { 2011 } else if (err == 0) {
1894 rbio->faila = -1; 2012 rbio->faila = -1;
1895 rbio->failb = -1; 2013 rbio->failb = -1;
1896 finish_rmw(rbio); 2014
2015 if (rbio->operation == BTRFS_RBIO_WRITE)
2016 finish_rmw(rbio);
2017 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018 finish_parity_scrub(rbio, 0);
2019 else
2020 BUG();
1897 } else { 2021 } else {
1898 rbio_orig_end_io(rbio, err, 0); 2022 rbio_orig_end_io(rbio, err, 0);
1899 } 2023 }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
1917 set_bio_pages_uptodate(bio); 2041 set_bio_pages_uptodate(bio);
1918 bio_put(bio); 2042 bio_put(bio);
1919 2043
1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 2044 if (!atomic_dec_and_test(&rbio->stripes_pending))
1921 return; 2045 return;
1922 2046
1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 2047 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1924 rbio_orig_end_io(rbio, -EIO, 0); 2048 rbio_orig_end_io(rbio, -EIO, 0);
1925 else 2049 else
1926 __raid_recover_end_io(rbio); 2050 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2061static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{ 2062{
1939 int bios_to_read = 0; 2063 int bios_to_read = 0;
1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 2064 struct bio_list bio_list;
1942 int ret; 2065 int ret;
1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2066 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1951 if (ret) 2074 if (ret)
1952 goto cleanup; 2075 goto cleanup;
1953 2076
1954 atomic_set(&rbio->bbio->error, 0); 2077 atomic_set(&rbio->error, 0);
1955 2078
1956 /* 2079 /*
1957 * read everything that hasn't failed. Thanks to the 2080 * read everything that hasn't failed. Thanks to the
1958 * stripe cache, it is possible that some or all of these 2081 * stripe cache, it is possible that some or all of these
1959 * pages are going to be uptodate. 2082 * pages are going to be uptodate.
1960 */ 2083 */
1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 2084 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1962 if (rbio->faila == stripe || rbio->failb == stripe) { 2085 if (rbio->faila == stripe || rbio->failb == stripe) {
1963 atomic_inc(&rbio->bbio->error); 2086 atomic_inc(&rbio->error);
1964 continue; 2087 continue;
1965 } 2088 }
1966 2089
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1990 * were up to date, or we might have no bios to read because 2113 * were up to date, or we might have no bios to read because
1991 * the devices were gone. 2114 * the devices were gone.
1992 */ 2115 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 2116 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio); 2117 __raid_recover_end_io(rbio);
1995 goto out; 2118 goto out;
1996 } else { 2119 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2002 * the bbio may be freed once we submit the last bio. Make sure 2125 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that 2126 * not to touch it after that
2004 */ 2127 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2128 atomic_set(&rbio->stripes_pending, bios_to_read);
2006 while (1) { 2129 while (1) {
2007 bio = bio_list_pop(&bio_list); 2130 bio = bio_list_pop(&bio_list);
2008 if (!bio) 2131 if (!bio)
@@ -2021,7 +2144,7 @@ out:
2021 return 0; 2144 return 0;
2022 2145
2023cleanup: 2146cleanup:
2024 if (rbio->read_rebuild) 2147 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2025 rbio_orig_end_io(rbio, -EIO, 0); 2148 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO; 2149 return -EIO;
2027} 2150}
@@ -2034,34 +2157,42 @@ cleanup:
2034 */ 2157 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map, 2159 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num) 2160 u64 stripe_len, int mirror_num, int generic_io)
2038{ 2161{
2039 struct btrfs_raid_bio *rbio; 2162 struct btrfs_raid_bio *rbio;
2040 int ret; 2163 int ret;
2041 2164
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) 2166 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2044 return PTR_ERR(rbio); 2168 return PTR_ERR(rbio);
2169 }
2045 2170
2046 rbio->read_rebuild = 1; 2171 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2047 bio_list_add(&rbio->bio_list, bio); 2172 bio_list_add(&rbio->bio_list, bio);
2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2173 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 2174
2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2175 rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 if (rbio->faila == -1) { 2176 if (rbio->faila == -1) {
2052 BUG(); 2177 BUG();
2053 kfree(raid_map); 2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2054 kfree(bbio);
2055 kfree(rbio); 2179 kfree(rbio);
2056 return -EIO; 2180 return -EIO;
2057 } 2181 }
2058 2182
2183 if (generic_io) {
2184 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1;
2186 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188 }
2189
2059 /* 2190 /*
2060 * reconstruct from the q stripe if they are 2191 * reconstruct from the q stripe if they are
2061 * asking for mirror 3 2192 * asking for mirror 3
2062 */ 2193 */
2063 if (mirror_num == 3) 2194 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2; 2195 rbio->failb = rbio->real_stripes - 2;
2065 2196
2066 ret = lock_stripe_add(rbio); 2197 ret = lock_stripe_add(rbio);
2067 2198
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio); 2230 __raid56_parity_recover(rbio);
2100} 2231}
2232
2233/*
2234 * The following code is used to scrub/replace the parity stripe
2235 *
2236 * Note: We need make sure all the pages that add into the scrub/replace
2237 * raid bio are correct and not be changed during the scrub/replace. That
2238 * is those pages just hold metadata or file data with checksum.
2239 */
2240
2241struct btrfs_raid_bio *
2242raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243 struct btrfs_bio *bbio, u64 *raid_map,
2244 u64 stripe_len, struct btrfs_device *scrub_dev,
2245 unsigned long *dbitmap, int stripe_nsectors)
2246{
2247 struct btrfs_raid_bio *rbio;
2248 int i;
2249
2250 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2251 if (IS_ERR(rbio))
2252 return NULL;
2253 bio_list_add(&rbio->bio_list, bio);
2254 /*
2255 * This is a special bio which is used to hold the completion handler
2256 * and make the scrub rbio is similar to the other types
2257 */
2258 ASSERT(!bio->bi_iter.bi_size);
2259 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2260
2261 for (i = 0; i < rbio->real_stripes; i++) {
2262 if (bbio->stripes[i].dev == scrub_dev) {
2263 rbio->scrubp = i;
2264 break;
2265 }
2266 }
2267
2268 /* Now we just support the sectorsize equals to page size */
2269 ASSERT(root->sectorsize == PAGE_SIZE);
2270 ASSERT(rbio->stripe_npages == stripe_nsectors);
2271 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2272
2273 return rbio;
2274}
2275
2276void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277 struct page *page, u64 logical)
2278{
2279 int stripe_offset;
2280 int index;
2281
2282 ASSERT(logical >= rbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page;
2288}
2289
2290/*
2291 * We just scrub the parity that we have correct data on the same horizontal,
2292 * so we needn't allocate all pages for all the stripes.
2293 */
2294static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295{
2296 int i;
2297 int bit;
2298 int index;
2299 struct page *page;
2300
2301 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302 for (i = 0; i < rbio->real_stripes; i++) {
2303 index = i * rbio->stripe_npages + bit;
2304 if (rbio->stripe_pages[index])
2305 continue;
2306
2307 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308 if (!page)
2309 return -ENOMEM;
2310 rbio->stripe_pages[index] = page;
2311 ClearPageUptodate(page);
2312 }
2313 }
2314 return 0;
2315}
2316
2317/*
2318 * end io function used by finish_rmw. When we finally
2319 * get here, we've written a full stripe
2320 */
2321static void raid_write_parity_end_io(struct bio *bio, int err)
2322{
2323 struct btrfs_raid_bio *rbio = bio->bi_private;
2324
2325 if (err)
2326 fail_bio_stripe(rbio, bio);
2327
2328 bio_put(bio);
2329
2330 if (!atomic_dec_and_test(&rbio->stripes_pending))
2331 return;
2332
2333 err = 0;
2334
2335 if (atomic_read(&rbio->error))
2336 err = -EIO;
2337
2338 rbio_orig_end_io(rbio, err, 0);
2339}
2340
2341static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2342 int need_check)
2343{
2344 struct btrfs_bio *bbio = rbio->bbio;
2345 void *pointers[rbio->real_stripes];
2346 DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2347 int nr_data = rbio->nr_data;
2348 int stripe;
2349 int pagenr;
2350 int p_stripe = -1;
2351 int q_stripe = -1;
2352 struct page *p_page = NULL;
2353 struct page *q_page = NULL;
2354 struct bio_list bio_list;
2355 struct bio *bio;
2356 int is_replace = 0;
2357 int ret;
2358
2359 bio_list_init(&bio_list);
2360
2361 if (rbio->real_stripes - rbio->nr_data == 1) {
2362 p_stripe = rbio->real_stripes - 1;
2363 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2364 p_stripe = rbio->real_stripes - 2;
2365 q_stripe = rbio->real_stripes - 1;
2366 } else {
2367 BUG();
2368 }
2369
2370 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2371 is_replace = 1;
2372 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2373 }
2374
2375 /*
2376 * Because the higher layers(scrubber) are unlikely to
2377 * use this area of the disk again soon, so don't cache
2378 * it.
2379 */
2380 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2381
2382 if (!need_check)
2383 goto writeback;
2384
2385 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386 if (!p_page)
2387 goto cleanup;
2388 SetPageUptodate(p_page);
2389
2390 if (q_stripe != -1) {
2391 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2392 if (!q_page) {
2393 __free_page(p_page);
2394 goto cleanup;
2395 }
2396 SetPageUptodate(q_page);
2397 }
2398
2399 atomic_set(&rbio->error, 0);
2400
2401 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2402 struct page *p;
2403 void *parity;
2404 /* first collect one page from each data stripe */
2405 for (stripe = 0; stripe < nr_data; stripe++) {
2406 p = page_in_rbio(rbio, stripe, pagenr, 0);
2407 pointers[stripe] = kmap(p);
2408 }
2409
2410 /* then add the parity stripe */
2411 pointers[stripe++] = kmap(p_page);
2412
2413 if (q_stripe != -1) {
2414
2415 /*
2416 * raid6, add the qstripe and call the
2417 * library function to fill in our p/q
2418 */
2419 pointers[stripe++] = kmap(q_page);
2420
2421 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2422 pointers);
2423 } else {
2424 /* raid5 */
2425 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2426 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2427 }
2428
2429 /* Check scrubbing pairty and repair it */
2430 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2431 parity = kmap(p);
2432 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2433 memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2434 else
2435 /* Parity is right, needn't writeback */
2436 bitmap_clear(rbio->dbitmap, pagenr, 1);
2437 kunmap(p);
2438
2439 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2440 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2441 }
2442
2443 __free_page(p_page);
2444 if (q_page)
2445 __free_page(q_page);
2446
2447writeback:
2448 /*
2449 * time to start writing. Make bios for everything from the
2450 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2451 * everything else.
2452 */
2453 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2454 struct page *page;
2455
2456 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2457 ret = rbio_add_io_page(rbio, &bio_list,
2458 page, rbio->scrubp, pagenr, rbio->stripe_len);
2459 if (ret)
2460 goto cleanup;
2461 }
2462
2463 if (!is_replace)
2464 goto submit_write;
2465
2466 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2467 struct page *page;
2468
2469 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2470 ret = rbio_add_io_page(rbio, &bio_list, page,
2471 bbio->tgtdev_map[rbio->scrubp],
2472 pagenr, rbio->stripe_len);
2473 if (ret)
2474 goto cleanup;
2475 }
2476
2477submit_write:
2478 nr_data = bio_list_size(&bio_list);
2479 if (!nr_data) {
2480 /* Every parity is right */
2481 rbio_orig_end_io(rbio, 0, 0);
2482 return;
2483 }
2484
2485 atomic_set(&rbio->stripes_pending, nr_data);
2486
2487 while (1) {
2488 bio = bio_list_pop(&bio_list);
2489 if (!bio)
2490 break;
2491
2492 bio->bi_private = rbio;
2493 bio->bi_end_io = raid_write_parity_end_io;
2494 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2495 submit_bio(WRITE, bio);
2496 }
2497 return;
2498
2499cleanup:
2500 rbio_orig_end_io(rbio, -EIO, 0);
2501}
2502
2503static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504{
2505 if (stripe >= 0 && stripe < rbio->nr_data)
2506 return 1;
2507 return 0;
2508}
2509
2510/*
2511 * While we're doing the parity check and repair, we could have errors
2512 * in reading pages off the disk. This checks for errors and if we're
2513 * not able to read the page it'll trigger parity reconstruction. The
2514 * parity scrub will be finished after we've reconstructed the failed
2515 * stripes
2516 */
2517static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2518{
2519 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2520 goto cleanup;
2521
2522 if (rbio->faila >= 0 || rbio->failb >= 0) {
2523 int dfail = 0, failp = -1;
2524
2525 if (is_data_stripe(rbio, rbio->faila))
2526 dfail++;
2527 else if (is_parity_stripe(rbio->faila))
2528 failp = rbio->faila;
2529
2530 if (is_data_stripe(rbio, rbio->failb))
2531 dfail++;
2532 else if (is_parity_stripe(rbio->failb))
2533 failp = rbio->failb;
2534
2535 /*
2536 * Because we can not use a scrubbing parity to repair
2537 * the data, so the capability of the repair is declined.
2538 * (In the case of RAID5, we can not repair anything)
2539 */
2540 if (dfail > rbio->bbio->max_errors - 1)
2541 goto cleanup;
2542
2543 /*
2544 * If all data is good, only parity is correctly, just
2545 * repair the parity.
2546 */
2547 if (dfail == 0) {
2548 finish_parity_scrub(rbio, 0);
2549 return;
2550 }
2551
2552 /*
2553 * Here means we got one corrupted data stripe and one
2554 * corrupted parity on RAID6, if the corrupted parity
2555 * is scrubbing parity, luckly, use the other one to repair
2556 * the data, or we can not repair the data stripe.
2557 */
2558 if (failp != rbio->scrubp)
2559 goto cleanup;
2560
2561 __raid_recover_end_io(rbio);
2562 } else {
2563 finish_parity_scrub(rbio, 1);
2564 }
2565 return;
2566
2567cleanup:
2568 rbio_orig_end_io(rbio, -EIO, 0);
2569}
2570
2571/*
2572 * end io for the read phase of the rmw cycle. All the bios here are physical
2573 * stripe bios we've read from the disk so we can recalculate the parity of the
2574 * stripe.
2575 *
2576 * This will usually kick off finish_rmw once all the bios are read in, but it
2577 * may trigger parity reconstruction if we had any errors along the way
2578 */
2579static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2580{
2581 struct btrfs_raid_bio *rbio = bio->bi_private;
2582
2583 if (err)
2584 fail_bio_stripe(rbio, bio);
2585 else
2586 set_bio_pages_uptodate(bio);
2587
2588 bio_put(bio);
2589
2590 if (!atomic_dec_and_test(&rbio->stripes_pending))
2591 return;
2592
2593 /*
2594 * this will normally call finish_rmw to start our write
2595 * but if there are any failed stripes we'll reconstruct
2596 * from parity first
2597 */
2598 validate_rbio_for_parity_scrub(rbio);
2599}
2600
2601static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2602{
2603 int bios_to_read = 0;
2604 struct bio_list bio_list;
2605 int ret;
2606 int pagenr;
2607 int stripe;
2608 struct bio *bio;
2609
2610 ret = alloc_rbio_essential_pages(rbio);
2611 if (ret)
2612 goto cleanup;
2613
2614 bio_list_init(&bio_list);
2615
2616 atomic_set(&rbio->error, 0);
2617 /*
2618 * build a list of bios to read all the missing parts of this
2619 * stripe
2620 */
2621 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2622 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2623 struct page *page;
2624 /*
2625 * we want to find all the pages missing from
2626 * the rbio and read them from the disk. If
2627 * page_in_rbio finds a page in the bio list
2628 * we don't need to read it off the stripe.
2629 */
2630 page = page_in_rbio(rbio, stripe, pagenr, 1);
2631 if (page)
2632 continue;
2633
2634 page = rbio_stripe_page(rbio, stripe, pagenr);
2635 /*
2636 * the bio cache may have handed us an uptodate
2637 * page. If so, be happy and use it
2638 */
2639 if (PageUptodate(page))
2640 continue;
2641
2642 ret = rbio_add_io_page(rbio, &bio_list, page,
2643 stripe, pagenr, rbio->stripe_len);
2644 if (ret)
2645 goto cleanup;
2646 }
2647 }
2648
2649 bios_to_read = bio_list_size(&bio_list);
2650 if (!bios_to_read) {
2651 /*
2652 * this can happen if others have merged with
2653 * us, it means there is nothing left to read.
2654 * But if there are missing devices it may not be
2655 * safe to do the full stripe write yet.
2656 */
2657 goto finish;
2658 }
2659
2660 /*
2661 * the bbio may be freed once we submit the last bio. Make sure
2662 * not to touch it after that
2663 */
2664 atomic_set(&rbio->stripes_pending, bios_to_read);
2665 while (1) {
2666 bio = bio_list_pop(&bio_list);
2667 if (!bio)
2668 break;
2669
2670 bio->bi_private = rbio;
2671 bio->bi_end_io = raid56_parity_scrub_end_io;
2672
2673 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2674 BTRFS_WQ_ENDIO_RAID56);
2675
2676 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2677 submit_bio(READ, bio);
2678 }
2679 /* the actual write will happen once the reads are done */
2680 return;
2681
2682cleanup:
2683 rbio_orig_end_io(rbio, -EIO, 0);
2684 return;
2685
2686finish:
2687 validate_rbio_for_parity_scrub(rbio);
2688}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692 struct btrfs_raid_bio *rbio;
2693
2694 rbio = container_of(work, struct btrfs_raid_bio, work);
2695 raid56_parity_scrub_stripe(rbio);
2696}
2697
2698static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699{
2700 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701 scrub_parity_work, NULL, NULL);
2702
2703 btrfs_queue_work(rbio->fs_info->rmw_workers,
2704 &rbio->work);
2705}
2706
2707void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2708{
2709 if (!lock_stripe_add(rbio))
2710 async_scrub_parity(rbio);
2711}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE)) 40 ((x) == RAID6_Q_STRIPE))
41 41
42struct btrfs_raid_bio;
43struct btrfs_device;
44
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num); 47 u64 stripe_len, int mirror_num, int generic_io);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len); 50 u64 stripe_len);
48 51
52struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map,
55 u64 stripe_len, struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical);
59void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
60
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 61int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 62void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif 63#endif
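raid56.h now exports the parity-scrub entry points that scrub.c (next in this diff) drives: allocate a scrub rbio for one full stripe, attach the already-checksummed data pages, then submit it so the raid56 code reads the stripe, recomputes P/Q and rewrites only the parity pages that turn out to be wrong (plus the replace target, if any). A rough sketch of a caller, with the bbio/raid_map mapping setup and error handling omitted, and the surrounding names assumed rather than taken from the patch:

static int scrub_one_parity_stripe(struct btrfs_root *root, struct bio *bio,
				   struct btrfs_bio *bbio, u64 *raid_map,
				   u64 stripe_len,
				   struct btrfs_device *scrub_dev,
				   unsigned long *dbitmap, int nsectors,
				   struct page **pages, u64 *logicals,
				   int npages)
{
	struct btrfs_raid_bio *rbio;
	int i;

	/* bio is an empty placeholder that only carries the end_io handler */
	rbio = raid56_parity_alloc_scrub_rbio(root, bio, bbio, raid_map,
					      stripe_len, scrub_dev,
					      dbitmap, nsectors);
	if (!rbio)
		return -ENOMEM;

	/* hand over the data pages whose checksums already verified */
	for (i = 0; i < npages; i++)
		raid56_parity_add_scrub_pages(rbio, pages[i], logicals[i]);

	/* read the stripe, check/repair parity, write back what changed */
	raid56_parity_submit_scrub_rbio(rbio);
	return 0;
}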
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4325bb0111d9..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
63 */ 63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 65
66struct scrub_recover {
67 atomic_t refs;
68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length;
71};
72
66struct scrub_page { 73struct scrub_page {
67 struct scrub_block *sblock; 74 struct scrub_block *sblock;
68 struct page *page; 75 struct page *page;
69 struct btrfs_device *dev; 76 struct btrfs_device *dev;
77 struct list_head list;
70 u64 flags; /* extent flags */ 78 u64 flags; /* extent flags */
71 u64 generation; 79 u64 generation;
72 u64 logical; 80 u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
79 unsigned int io_error:1; 87 unsigned int io_error:1;
80 }; 88 };
81 u8 csum[BTRFS_CSUM_SIZE]; 89 u8 csum[BTRFS_CSUM_SIZE];
90
91 struct scrub_recover *recover;
82}; 92};
83 93
84struct scrub_bio { 94struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
105 atomic_t outstanding_pages; 115 atomic_t outstanding_pages;
106 atomic_t ref_count; /* free mem on transition to zero */ 116 atomic_t ref_count; /* free mem on transition to zero */
107 struct scrub_ctx *sctx; 117 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity;
108 struct { 119 struct {
109 unsigned int header_error:1; 120 unsigned int header_error:1;
110 unsigned int checksum_error:1; 121 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1; 122 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */ 123 unsigned int generation_error:1; /* also sets header_error */
124
 125 /* The following is for the data used to check parity */
 126 /* i.e. data blocks that are covered by a checksum */
127 unsigned int data_corrected:1;
113 }; 128 };
114}; 129};
115 130
 131/* Used for the chunks with parity stripes such as RAID5/6 */
132struct scrub_parity {
133 struct scrub_ctx *sctx;
134
135 struct btrfs_device *scrub_dev;
136
137 u64 logic_start;
138
139 u64 logic_end;
140
141 int nsectors;
142
143 int stripe_len;
144
145 atomic_t ref_count;
146
147 struct list_head spages;
148
149 /* Work of parity check and repair */
150 struct btrfs_work work;
151
152 /* Mark the parity blocks which have data */
153 unsigned long *dbitmap;
154
155 /*
 156 * Mark the parity blocks which have data, but where errors
 157 * happened when reading or checking that data
158 */
159 unsigned long *ebitmap;
160
161 unsigned long bitmap[0];
162};
163
116struct scrub_wr_ctx { 164struct scrub_wr_ctx {
117 struct scrub_bio *wr_curr_bio; 165 struct scrub_bio *wr_curr_bio;
118 struct btrfs_device *tgtdev; 166 struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
196static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 244static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
197 struct scrub_block *sblock, int is_metadata, 245 struct scrub_block *sblock, int is_metadata,
198 int have_csum, u8 *csum, u64 generation, 246 int have_csum, u8 *csum, u64 generation,
199 u16 csum_size); 247 u16 csum_size, int retry_failed_mirror);
200static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 248static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
201 struct scrub_block *sblock, 249 struct scrub_block *sblock,
202 int is_metadata, int have_csum, 250 int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
218static void scrub_block_put(struct scrub_block *sblock); 266static void scrub_block_put(struct scrub_block *sblock);
219static void scrub_page_get(struct scrub_page *spage); 267static void scrub_page_get(struct scrub_page *spage);
220static void scrub_page_put(struct scrub_page *spage); 268static void scrub_page_put(struct scrub_page *spage);
269static void scrub_parity_get(struct scrub_parity *sparity);
270static void scrub_parity_put(struct scrub_parity *sparity);
221static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 271static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
222 struct scrub_page *spage); 272 struct scrub_page *spage);
223static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 273static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
790 scrub_pending_trans_workers_dec(sctx); 840 scrub_pending_trans_workers_dec(sctx);
791} 841}
792 842
843static inline void scrub_get_recover(struct scrub_recover *recover)
844{
845 atomic_inc(&recover->refs);
846}
847
848static inline void scrub_put_recover(struct scrub_recover *recover)
849{
850 if (atomic_dec_and_test(&recover->refs)) {
851 kfree(recover->bbio);
852 kfree(recover->raid_map);
853 kfree(recover);
854 }
855}
856
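The scrub_recover added above is shared by every recheck page built from one btrfs_map_sblock() call, so its lifetime is handled with a bare atomic reference count: the final scrub_put_recover() frees the bbio, the raid_map and the wrapper itself. A minimal userspace sketch of the same get/put pattern, using C11 atomics and illustrative names rather than the kernel helpers:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct recover {
    atomic_int refs;
    void *payload;              /* stands in for bbio/raid_map */
};

static struct recover *recover_alloc(void)
{
    struct recover *r = calloc(1, sizeof(*r));

    if (r)
        atomic_init(&r->refs, 1);   /* creator holds the first reference */
    return r;
}

static void recover_get(struct recover *r)
{
    atomic_fetch_add(&r->refs, 1);
}

static void recover_put(struct recover *r)
{
    /* free only when the last reference goes away */
    if (atomic_fetch_sub(&r->refs, 1) == 1) {
        free(r->payload);
        free(r);
    }
}

int main(void)
{
    struct recover *r = recover_alloc();

    if (!r)
        return 1;
    recover_get(r);     /* e.g. a scrub page takes a reference */
    recover_put(r);     /* that page is released */
    recover_put(r);     /* creator's reference; object freed here */
    printf("released\n");
    return 0;
}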
793/* 857/*
794 * scrub_handle_errored_block gets called when either verification of the 858 * scrub_handle_errored_block gets called when either verification of the
795 * pages failed or the bio failed to read, e.g. with EIO. In the latter 859 * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
906 970
907 /* build and submit the bios for the failed mirror, check checksums */ 971 /* build and submit the bios for the failed mirror, check checksums */
908 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 972 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
909 csum, generation, sctx->csum_size); 973 csum, generation, sctx->csum_size, 1);
910 974
911 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 975 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
912 sblock_bad->no_io_error_seen) { 976 sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
920 */ 984 */
921 spin_lock(&sctx->stat_lock); 985 spin_lock(&sctx->stat_lock);
922 sctx->stat.unverified_errors++; 986 sctx->stat.unverified_errors++;
987 sblock_to_check->data_corrected = 1;
923 spin_unlock(&sctx->stat_lock); 988 spin_unlock(&sctx->stat_lock);
924 989
925 if (sctx->is_dev_replace) 990 if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
1019 /* build and submit the bios, check checksums */ 1084 /* build and submit the bios, check checksums */
1020 scrub_recheck_block(fs_info, sblock_other, is_metadata, 1085 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1021 have_csum, csum, generation, 1086 have_csum, csum, generation,
1022 sctx->csum_size); 1087 sctx->csum_size, 0);
1023 1088
1024 if (!sblock_other->header_error && 1089 if (!sblock_other->header_error &&
1025 !sblock_other->checksum_error && 1090 !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
1169 */ 1234 */
1170 scrub_recheck_block(fs_info, sblock_bad, 1235 scrub_recheck_block(fs_info, sblock_bad,
1171 is_metadata, have_csum, csum, 1236 is_metadata, have_csum, csum,
1172 generation, sctx->csum_size); 1237 generation, sctx->csum_size, 1);
1173 if (!sblock_bad->header_error && 1238 if (!sblock_bad->header_error &&
1174 !sblock_bad->checksum_error && 1239 !sblock_bad->checksum_error &&
1175 sblock_bad->no_io_error_seen) 1240 sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
1180corrected_error: 1245corrected_error:
1181 spin_lock(&sctx->stat_lock); 1246 spin_lock(&sctx->stat_lock);
1182 sctx->stat.corrected_errors++; 1247 sctx->stat.corrected_errors++;
1248 sblock_to_check->data_corrected = 1;
1183 spin_unlock(&sctx->stat_lock); 1249 spin_unlock(&sctx->stat_lock);
1184 printk_ratelimited_in_rcu(KERN_ERR 1250 printk_ratelimited_in_rcu(KERN_ERR
1185 "BTRFS: fixed up error at logical %llu on dev %s\n", 1251 "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
1201 mirror_index++) { 1267 mirror_index++) {
1202 struct scrub_block *sblock = sblocks_for_recheck + 1268 struct scrub_block *sblock = sblocks_for_recheck +
1203 mirror_index; 1269 mirror_index;
1270 struct scrub_recover *recover;
1204 int page_index; 1271 int page_index;
1205 1272
1206 for (page_index = 0; page_index < sblock->page_count; 1273 for (page_index = 0; page_index < sblock->page_count;
1207 page_index++) { 1274 page_index++) {
1208 sblock->pagev[page_index]->sblock = NULL; 1275 sblock->pagev[page_index]->sblock = NULL;
1276 recover = sblock->pagev[page_index]->recover;
1277 if (recover) {
1278 scrub_put_recover(recover);
1279 sblock->pagev[page_index]->recover =
1280 NULL;
1281 }
1209 scrub_page_put(sblock->pagev[page_index]); 1282 scrub_page_put(sblock->pagev[page_index]);
1210 } 1283 }
1211 } 1284 }
@@ -1215,14 +1288,63 @@ out:
1215 return 0; 1288 return 0;
1216} 1289}
1217 1290
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1292{
1293 if (raid_map) {
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1295 return 3;
1296 else
1297 return 2;
1298 } else {
1299 return (int)bbio->num_stripes;
1300 }
1301}
1302
1303static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1304 u64 mapped_length,
1305 int nstripes, int mirror,
1306 int *stripe_index,
1307 u64 *stripe_offset)
1308{
1309 int i;
1310
1311 if (raid_map) {
1312 /* RAID5/6 */
1313 for (i = 0; i < nstripes; i++) {
1314 if (raid_map[i] == RAID6_Q_STRIPE ||
1315 raid_map[i] == RAID5_P_STRIPE)
1316 continue;
1317
1318 if (logical >= raid_map[i] &&
1319 logical < raid_map[i] + mapped_length)
1320 break;
1321 }
1322
1323 *stripe_index = i;
1324 *stripe_offset = logical - raid_map[i];
1325 } else {
1326 /* The other RAID type */
1327 *stripe_index = mirror;
1328 *stripe_offset = 0;
1329 }
1330}
1331
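For RAID5/6 the raid_map returned by btrfs_map_sblock() records, per stripe, the logical address that stripe covers (or the P/Q markers for parity stripes), so a mirror number alone no longer says where a page lives on disk; scrub_stripe_index_and_offset() scans the map for the data stripe whose range contains the logical address. A small userspace sketch of that lookup, with made-up marker values and addresses chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

#define P_STRIPE ((uint64_t)-2)   /* illustrative stand-ins for    */
#define Q_STRIPE ((uint64_t)-1)   /* RAID5_P_STRIPE/RAID6_Q_STRIPE */

/* Find the data stripe whose [start, start + mapped_length) range
 * contains 'logical', and the offset into that stripe. */
static void stripe_index_and_offset(uint64_t logical, const uint64_t *raid_map,
                                    uint64_t mapped_length, int nstripes,
                                    int *index, uint64_t *offset)
{
    int i;

    for (i = 0; i < nstripes; i++) {
        if (raid_map[i] == P_STRIPE || raid_map[i] == Q_STRIPE)
            continue;       /* parity stripes hold no logical data */
        if (logical >= raid_map[i] &&
            logical < raid_map[i] + mapped_length)
            break;
    }
    *index = i;
    *offset = logical - raid_map[i];
}

int main(void)
{
    /* two data stripes at logical 0 and 64K, one parity stripe */
    uint64_t raid_map[] = { 0, 65536, P_STRIPE };
    uint64_t offset;
    int index;

    stripe_index_and_offset(69632, raid_map, 65536, 3, &index, &offset);
    printf("stripe %d, offset %llu\n", index,
           (unsigned long long)offset);    /* stripe 1, offset 4096 */
    return 0;
}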
1218static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1219 struct btrfs_fs_info *fs_info, 1333 struct btrfs_fs_info *fs_info,
1220 struct scrub_block *original_sblock, 1334 struct scrub_block *original_sblock,
1221 u64 length, u64 logical, 1335 u64 length, u64 logical,
1222 struct scrub_block *sblocks_for_recheck) 1336 struct scrub_block *sblocks_for_recheck)
1223{ 1337{
1338 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen;
1342 u64 mapped_length;
1343 u64 stripe_offset;
1344 int stripe_index;
1224 int page_index; 1345 int page_index;
1225 int mirror_index; 1346 int mirror_index;
1347 int nmirrors;
1226 int ret; 1348 int ret;
1227 1349
1228 /* 1350 /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1233 1355
1234 page_index = 0; 1356 page_index = 0;
1235 while (length > 0) { 1357 while (length > 0) {
1236 u64 sublen = min_t(u64, length, PAGE_SIZE); 1358 sublen = min_t(u64, length, PAGE_SIZE);
1237 u64 mapped_length = sublen; 1359 mapped_length = sublen;
1238 struct btrfs_bio *bbio = NULL; 1360 bbio = NULL;
1361 raid_map = NULL;
1239 1362
1240 /* 1363 /*
1241 * with a length of PAGE_SIZE, each returned stripe 1364 * with a length of PAGE_SIZE, each returned stripe
1242 * represents one mirror 1365 * represents one mirror
1243 */ 1366 */
1244 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1245 &mapped_length, &bbio, 0); 1368 &mapped_length, &bbio, 0, &raid_map);
1246 if (ret || !bbio || mapped_length < sublen) { 1369 if (ret || !bbio || mapped_length < sublen) {
1247 kfree(bbio); 1370 kfree(bbio);
1371 kfree(raid_map);
1248 return -EIO; 1372 return -EIO;
1249 } 1373 }
1250 1374
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) {
1377 kfree(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM;
1380 }
1381
1382 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length;
1386
1251 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1252 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1388
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1390 for (mirror_index = 0; mirror_index < nmirrors;
1253 mirror_index++) { 1391 mirror_index++) {
1254 struct scrub_block *sblock; 1392 struct scrub_block *sblock;
1255 struct scrub_page *page; 1393 struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
1265 spin_lock(&sctx->stat_lock); 1403 spin_lock(&sctx->stat_lock);
1266 sctx->stat.malloc_errors++; 1404 sctx->stat.malloc_errors++;
1267 spin_unlock(&sctx->stat_lock); 1405 spin_unlock(&sctx->stat_lock);
1268 kfree(bbio); 1406 scrub_put_recover(recover);
1269 return -ENOMEM; 1407 return -ENOMEM;
1270 } 1408 }
1271 scrub_page_get(page); 1409 scrub_page_get(page);
1272 sblock->pagev[page_index] = page; 1410 sblock->pagev[page_index] = page;
1273 page->logical = logical; 1411 page->logical = logical;
1274 page->physical = bbio->stripes[mirror_index].physical; 1412
1413 scrub_stripe_index_and_offset(logical, raid_map,
1414 mapped_length,
1415 bbio->num_stripes,
1416 mirror_index,
1417 &stripe_index,
1418 &stripe_offset);
1419 page->physical = bbio->stripes[stripe_index].physical +
1420 stripe_offset;
1421 page->dev = bbio->stripes[stripe_index].dev;
1422
1275 BUG_ON(page_index >= original_sblock->page_count); 1423 BUG_ON(page_index >= original_sblock->page_count);
1276 page->physical_for_dev_replace = 1424 page->physical_for_dev_replace =
1277 original_sblock->pagev[page_index]-> 1425 original_sblock->pagev[page_index]->
1278 physical_for_dev_replace; 1426 physical_for_dev_replace;
1279 /* for missing devices, dev->bdev is NULL */ 1427 /* for missing devices, dev->bdev is NULL */
1280 page->dev = bbio->stripes[mirror_index].dev;
1281 page->mirror_num = mirror_index + 1; 1428 page->mirror_num = mirror_index + 1;
1282 sblock->page_count++; 1429 sblock->page_count++;
1283 page->page = alloc_page(GFP_NOFS); 1430 page->page = alloc_page(GFP_NOFS);
1284 if (!page->page) 1431 if (!page->page)
1285 goto leave_nomem; 1432 goto leave_nomem;
1433
1434 scrub_get_recover(recover);
1435 page->recover = recover;
1286 } 1436 }
1287 kfree(bbio); 1437 scrub_put_recover(recover);
1288 length -= sublen; 1438 length -= sublen;
1289 logical += sublen; 1439 logical += sublen;
1290 page_index++; 1440 page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
1293 return 0; 1443 return 0;
1294} 1444}
1295 1445
1446struct scrub_bio_ret {
1447 struct completion event;
1448 int error;
1449};
1450
1451static void scrub_bio_wait_endio(struct bio *bio, int error)
1452{
1453 struct scrub_bio_ret *ret = bio->bi_private;
1454
1455 ret->error = error;
1456 complete(&ret->event);
1457}
1458
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{
1461 return page->recover && page->recover->raid_map;
1462}
1463
1464static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1465 struct bio *bio,
1466 struct scrub_page *page)
1467{
1468 struct scrub_bio_ret done;
1469 int ret;
1470
1471 init_completion(&done.event);
1472 done.error = 0;
1473 bio->bi_iter.bi_sector = page->logical >> 9;
1474 bio->bi_private = &done;
1475 bio->bi_end_io = scrub_bio_wait_endio;
1476
1477 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1478 page->recover->raid_map,
1479 page->recover->map_length,
1480 page->mirror_num, 0);
1481 if (ret)
1482 return ret;
1483
1484 wait_for_completion(&done.event);
1485 if (done.error)
1486 return -EIO;
1487
1488 return 0;
1489}
1490
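scrub_submit_raid56_bio_wait() turns the asynchronous raid56_parity_recover() path into a synchronous read: the bio's end_io callback records the error and signals a completion that the submitter blocks on. The same submit-then-wait shape, sketched in userspace with a pthread condition variable standing in for the kernel completion (all names here are illustrative, not kernel API):

#include <pthread.h>
#include <stdio.h>

struct bio_ret {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    int done;
    int error;
};

/* end_io callback: record the result and wake the waiter */
static void bio_endio(struct bio_ret *ret, int error)
{
    pthread_mutex_lock(&ret->lock);
    ret->error = error;
    ret->done = 1;
    pthread_cond_signal(&ret->cond);
    pthread_mutex_unlock(&ret->lock);
}

/* stands in for the async recovery path completing the "bio" */
static void *async_recover(void *arg)
{
    bio_endio(arg, 0);      /* pretend the read succeeded */
    return NULL;
}

int main(void)
{
    struct bio_ret done;
    pthread_t t;

    pthread_mutex_init(&done.lock, NULL);
    pthread_cond_init(&done.cond, NULL);
    done.done = 0;
    done.error = 0;

    pthread_create(&t, NULL, async_recover, &done);   /* "submit" */

    pthread_mutex_lock(&done.lock);                    /* wait for completion */
    while (!done.done)
        pthread_cond_wait(&done.cond, &done.lock);
    pthread_mutex_unlock(&done.lock);
    pthread_join(t, NULL);

    printf("read %s\n", done.error ? "failed" : "ok");
    return 0;
}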
1296/* 1491/*
1297 * this function will check the on disk data for checksum errors, header 1492 * this function will check the on disk data for checksum errors, header
1298 * errors and read I/O errors. If any I/O errors happen, the exact pages 1493 * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
1303static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1498static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1304 struct scrub_block *sblock, int is_metadata, 1499 struct scrub_block *sblock, int is_metadata,
1305 int have_csum, u8 *csum, u64 generation, 1500 int have_csum, u8 *csum, u64 generation,
1306 u16 csum_size) 1501 u16 csum_size, int retry_failed_mirror)
1307{ 1502{
1308 int page_num; 1503 int page_num;
1309 1504
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1329 continue; 1524 continue;
1330 } 1525 }
1331 bio->bi_bdev = page->dev->bdev; 1526 bio->bi_bdev = page->dev->bdev;
1332 bio->bi_iter.bi_sector = page->physical >> 9;
1333 1527
1334 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1528 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1335 if (btrfsic_submit_bio_wait(READ, bio)) 1529 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1336 sblock->no_io_error_seen = 0; 1530 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1531 sblock->no_io_error_seen = 0;
1532 } else {
1533 bio->bi_iter.bi_sector = page->physical >> 9;
1534
1535 if (btrfsic_submit_bio_wait(READ, bio))
1536 sblock->no_io_error_seen = 0;
1537 }
1337 1538
1338 bio_put(bio); 1539 bio_put(bio);
1339 } 1540 }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1486{ 1687{
1487 int page_num; 1688 int page_num;
1488 1689
1690 /*
1691 * This block is used for the check of the parity on the source device,
1692 * so the data needn't be written into the destination device.
1693 */
1694 if (sblock->sparity)
1695 return;
1696
1489 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1697 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1490 int ret; 1698 int ret;
1491 1699
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
1867 if (atomic_dec_and_test(&sblock->ref_count)) { 2075 if (atomic_dec_and_test(&sblock->ref_count)) {
1868 int i; 2076 int i;
1869 2077
2078 if (sblock->sparity)
2079 scrub_parity_put(sblock->sparity);
2080
1870 for (i = 0; i < sblock->page_count; i++) 2081 for (i = 0; i < sblock->page_count; i++)
1871 scrub_page_put(sblock->pagev[i]); 2082 scrub_page_put(sblock->pagev[i]);
1872 kfree(sblock); 2083 kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2124 scrub_pending_bio_dec(sctx); 2335 scrub_pending_bio_dec(sctx);
2125} 2336}
2126 2337
2338static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2339 unsigned long *bitmap,
2340 u64 start, u64 len)
2341{
2342 int offset;
2343 int nsectors;
2344 int sectorsize = sparity->sctx->dev_root->sectorsize;
2345
2346 if (len >= sparity->stripe_len) {
2347 bitmap_set(bitmap, 0, sparity->nsectors);
2348 return;
2349 }
2350
2351 start -= sparity->logic_start;
2352 offset = (int)do_div(start, sparity->stripe_len);
2353 offset /= sectorsize;
2354 nsectors = (int)len / sectorsize;
2355
2356 if (offset + nsectors <= sparity->nsectors) {
2357 bitmap_set(bitmap, offset, nsectors);
2358 return;
2359 }
2360
2361 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2362 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2363}
2364
2365static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2366 u64 start, u64 len)
2367{
2368 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2369}
2370
2371static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2372 u64 start, u64 len)
2373{
2374 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2375}
2376
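__scrub_mark_bitmap() translates a logical byte range into sector bits of the current stripe: the start is reduced modulo stripe_len, converted to a sector index, and if the range runs past the end of the stripe the marking wraps around to bit 0. A userspace sketch of the same arithmetic with a plain byte-per-sector "bitmap"; the 64K stripe and 4K sector values are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STRIPE_LEN   65536u
#define SECTORSIZE    4096u
#define NSECTORS     (STRIPE_LEN / SECTORSIZE)

/* mark sectors of [start, start + len), wrapping past the stripe end */
static void mark_sectors(unsigned char *bitmap, uint64_t logic_start,
                         uint64_t start, uint64_t len)
{
    unsigned int offset, nsectors, first;

    if (len >= STRIPE_LEN) {
        memset(bitmap, 1, NSECTORS);
        return;
    }

    offset = (unsigned int)((start - logic_start) % STRIPE_LEN) / SECTORSIZE;
    nsectors = (unsigned int)(len / SECTORSIZE);

    if (offset + nsectors <= NSECTORS) {
        memset(bitmap + offset, 1, nsectors);
        return;
    }

    /* range crosses the end of the stripe: set the tail, then wrap */
    first = NSECTORS - offset;
    memset(bitmap + offset, 1, first);
    memset(bitmap, 1, nsectors - first);
}

int main(void)
{
    unsigned char bitmap[NSECTORS] = { 0 };
    unsigned int i;

    /* 16K starting 8K before the stripe end: sectors 14,15 then 0,1 */
    mark_sectors(bitmap, 0, STRIPE_LEN - 8192, 16384);
    for (i = 0; i < NSECTORS; i++)
        if (bitmap[i])
            printf("sector %u marked\n", i);
    return 0;
}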
2127static void scrub_block_complete(struct scrub_block *sblock) 2377static void scrub_block_complete(struct scrub_block *sblock)
2128{ 2378{
2379 int corrupted = 0;
2380
2129 if (!sblock->no_io_error_seen) { 2381 if (!sblock->no_io_error_seen) {
2382 corrupted = 1;
2130 scrub_handle_errored_block(sblock); 2383 scrub_handle_errored_block(sblock);
2131 } else { 2384 } else {
2132 /* 2385 /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
2134 * dev replace case, otherwise write here in dev replace 2387 * dev replace case, otherwise write here in dev replace
2135 * case. 2388 * case.
2136 */ 2389 */
2137 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2390 corrupted = scrub_checksum(sblock);
2391 if (!corrupted && sblock->sctx->is_dev_replace)
2138 scrub_write_block_to_dev_replace(sblock); 2392 scrub_write_block_to_dev_replace(sblock);
2139 } 2393 }
2394
2395 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2396 u64 start = sblock->pagev[0]->logical;
2397 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2398 PAGE_SIZE;
2399
2400 scrub_parity_mark_sectors_error(sblock->sparity,
2401 start, end - start);
2402 }
2140} 2403}
2141 2404
2142static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2405static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
2228 return 0; 2491 return 0;
2229} 2492}
2230 2493
2494static int scrub_pages_for_parity(struct scrub_parity *sparity,
2495 u64 logical, u64 len,
2496 u64 physical, struct btrfs_device *dev,
2497 u64 flags, u64 gen, int mirror_num, u8 *csum)
2498{
2499 struct scrub_ctx *sctx = sparity->sctx;
2500 struct scrub_block *sblock;
2501 int index;
2502
2503 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2504 if (!sblock) {
2505 spin_lock(&sctx->stat_lock);
2506 sctx->stat.malloc_errors++;
2507 spin_unlock(&sctx->stat_lock);
2508 return -ENOMEM;
2509 }
2510
2511 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1);
2514 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity;
2517 scrub_parity_get(sparity);
2518
2519 for (index = 0; len > 0; index++) {
2520 struct scrub_page *spage;
2521 u64 l = min_t(u64, len, PAGE_SIZE);
2522
2523 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2524 if (!spage) {
2525leave_nomem:
2526 spin_lock(&sctx->stat_lock);
2527 sctx->stat.malloc_errors++;
2528 spin_unlock(&sctx->stat_lock);
2529 scrub_block_put(sblock);
2530 return -ENOMEM;
2531 }
2532 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2533 /* For scrub block */
2534 scrub_page_get(spage);
2535 sblock->pagev[index] = spage;
2536 /* For scrub parity */
2537 scrub_page_get(spage);
2538 list_add_tail(&spage->list, &sparity->spages);
2539 spage->sblock = sblock;
2540 spage->dev = dev;
2541 spage->flags = flags;
2542 spage->generation = gen;
2543 spage->logical = logical;
2544 spage->physical = physical;
2545 spage->mirror_num = mirror_num;
2546 if (csum) {
2547 spage->have_csum = 1;
2548 memcpy(spage->csum, csum, sctx->csum_size);
2549 } else {
2550 spage->have_csum = 0;
2551 }
2552 sblock->page_count++;
2553 spage->page = alloc_page(GFP_NOFS);
2554 if (!spage->page)
2555 goto leave_nomem;
2556 len -= l;
2557 logical += l;
2558 physical += l;
2559 }
2560
2561 WARN_ON(sblock->page_count == 0);
2562 for (index = 0; index < sblock->page_count; index++) {
2563 struct scrub_page *spage = sblock->pagev[index];
2564 int ret;
2565
2566 ret = scrub_add_page_to_rd_bio(sctx, spage);
2567 if (ret) {
2568 scrub_block_put(sblock);
2569 return ret;
2570 }
2571 }
2572
2573 /* last one frees, either here or in bio completion for last page */
2574 scrub_block_put(sblock);
2575 return 0;
2576}
2577
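scrub_pages_for_parity() above walks the extent in at-most-page-sized steps, advancing the logical and physical cursors together so each scrub_page records both where its data sits on disk and where it sits in the logical address space. The loop shape, reduced to a userspace sketch with an assumed 4K page size and illustrative addresses:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096u

static uint64_t min_u64(uint64_t a, uint64_t b)
{
    return a < b ? a : b;
}

int main(void)
{
    uint64_t logical = 1048576;    /* illustrative extent start (1M)  */
    uint64_t physical = 2097152;   /* illustrative on-disk start (2M) */
    uint64_t len = 10000;          /* not page aligned on purpose     */

    while (len > 0) {
        uint64_t l = min_u64(len, PAGE_SZ);

        /* one scrub_page would be filled in here */
        printf("page: logical=%llu physical=%llu len=%llu\n",
               (unsigned long long)logical,
               (unsigned long long)physical,
               (unsigned long long)l);

        len -= l;
        logical += l;
        physical += l;
    }
    return 0;
}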
2578static int scrub_extent_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u64 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num)
2582{
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 int ret;
2585 u8 csum[BTRFS_CSUM_SIZE];
2586 u32 blocksize;
2587
2588 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2589 blocksize = sctx->sectorsize;
2590 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2591 blocksize = sctx->nodesize;
2592 } else {
2593 blocksize = sctx->sectorsize;
2594 WARN_ON(1);
2595 }
2596
2597 while (len) {
2598 u64 l = min_t(u64, len, blocksize);
2599 int have_csum = 0;
2600
2601 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2602 /* push csums to sbio */
2603 have_csum = scrub_find_csum(sctx, logical, l, csum);
2604 if (have_csum == 0)
2605 goto skip;
2606 }
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num,
2609 have_csum ? csum : NULL);
2610skip:
2611 if (ret)
2612 return ret;
2613 len -= l;
2614 logical += l;
2615 physical += l;
2616 }
2617 return 0;
2618}
2619
2231/* 2620/*
2232 * Given a physical address, this will calculate its 2621
2233 * logical offset. If this is a parity stripe, it will return 2622
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
2236 * return 0 if it is a data stripe, 1 means parity stripe. 2625 * return 0 if it is a data stripe, 1 means parity stripe.
2237 */ 2626 */
2238static int get_raid56_logic_offset(u64 physical, int num, 2627static int get_raid56_logic_offset(u64 physical, int num,
2239 struct map_lookup *map, u64 *offset) 2628 struct map_lookup *map, u64 *offset,
2629 u64 *stripe_start)
2240{ 2630{
2241 int i; 2631 int i;
2242 int j = 0; 2632 int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
2247 2637
2248 last_offset = (physical - map->stripes[num].physical) * 2638 last_offset = (physical - map->stripes[num].physical) *
2249 nr_data_stripes(map); 2639 nr_data_stripes(map);
2640 if (stripe_start)
2641 *stripe_start = last_offset;
2642
2250 *offset = last_offset; 2643 *offset = last_offset;
2251 for (i = 0; i < nr_data_stripes(map); i++) { 2644 for (i = 0; i < nr_data_stripes(map); i++) {
2252 *offset = last_offset + i * map->stripe_len; 2645 *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
2269 return 1; 2662 return 1;
2270} 2663}
2271 2664
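Worked example for the last_offset computation shown above, with assumed numbers: on a chunk with three data stripes and a 64K stripe_len, a physical address 128K past map->stripes[num].physical gives last_offset = 128K * 3 = 384K, the chunk-relative logical start of the full stripe that contains this device stripe. The new stripe_start argument hands exactly that value back, so scrub_stripe() can compute the full-stripe range (stripe_logical to stripe_logical + increment - 1) and pass it to scrub_raid56_parity() whenever the device holds parity at that position.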
2665static void scrub_free_parity(struct scrub_parity *sparity)
2666{
2667 struct scrub_ctx *sctx = sparity->sctx;
2668 struct scrub_page *curr, *next;
2669 int nbits;
2670
2671 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2672 if (nbits) {
2673 spin_lock(&sctx->stat_lock);
2674 sctx->stat.read_errors += nbits;
2675 sctx->stat.uncorrectable_errors += nbits;
2676 spin_unlock(&sctx->stat_lock);
2677 }
2678
2679 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2680 list_del_init(&curr->list);
2681 scrub_page_put(curr);
2682 }
2683
2684 kfree(sparity);
2685}
2686
2687static void scrub_parity_bio_endio(struct bio *bio, int error)
2688{
2689 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2690 struct scrub_ctx *sctx = sparity->sctx;
2691
2692 if (error)
2693 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2694 sparity->nsectors);
2695
2696 scrub_free_parity(sparity);
2697 scrub_pending_bio_dec(sctx);
2698 bio_put(bio);
2699}
2700
2701static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2702{
2703 struct scrub_ctx *sctx = sparity->sctx;
2704 struct bio *bio;
2705 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length;
2710 int ret;
2711
2712 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2713 sparity->nsectors))
2714 goto out;
2715
2716 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map);
2720 if (ret || !bbio || !raid_map)
2721 goto bbio_out;
2722
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2724 if (!bio)
2725 goto bbio_out;
2726
2727 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2728 bio->bi_private = sparity;
2729 bio->bi_end_io = scrub_parity_bio_endio;
2730
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length,
2733 sparity->scrub_dev,
2734 sparity->dbitmap,
2735 sparity->nsectors);
2736 if (!rbio)
2737 goto rbio_out;
2738
2739 list_for_each_entry(spage, &sparity->spages, list)
2740 raid56_parity_add_scrub_pages(rbio, spage->page,
2741 spage->logical);
2742
2743 scrub_pending_bio_inc(sctx);
2744 raid56_parity_submit_scrub_rbio(rbio);
2745 return;
2746
2747rbio_out:
2748 bio_put(bio);
2749bbio_out:
2750 kfree(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock);
2755 sctx->stat.malloc_errors++;
2756 spin_unlock(&sctx->stat_lock);
2757out:
2758 scrub_free_parity(sparity);
2759}
2760
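Before building the scrub rbio, scrub_parity_check_and_repair() above drops every sector that already failed (ebitmap) from the set of sectors that carry data (dbitmap); if nothing is left there is no parity worth verifying and the structure is simply freed. The bitmap_andnot() step, shown on a single plain word in a userspace sketch (values are illustrative):

#include <stdio.h>

int main(void)
{
    unsigned long dbitmap = 0x0f;    /* sectors 0-3 carry data     */
    unsigned long ebitmap = 0x05;    /* sectors 0 and 2 had errors */
    unsigned long remaining;

    /* dbitmap &= ~ebitmap, as bitmap_andnot() does word by word */
    remaining = dbitmap & ~ebitmap;

    if (!remaining)
        printf("nothing left to check, free the parity context\n");
    else
        printf("check parity for sector mask 0x%lx\n", remaining);
    return 0;
}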
2761static inline int scrub_calc_parity_bitmap_len(int nsectors)
2762{
2763 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2764}
2765
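Worked example: with a 64K stripe_len and 4K sectors, nsectors is 16, so on a 64-bit build scrub_calc_parity_bitmap_len() returns DIV_ROUND_UP(16, 64) * (64 / 8) = 8 bytes per bitmap; scrub_raid56_parity() below allocates 2 * bitmap_len extra bytes in one go and points dbitmap at the first half and ebitmap at the second.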
2766static void scrub_parity_get(struct scrub_parity *sparity)
2767{
2768 atomic_inc(&sparity->ref_count);
2769}
2770
2771static void scrub_parity_put(struct scrub_parity *sparity)
2772{
2773 if (!atomic_dec_and_test(&sparity->ref_count))
2774 return;
2775
2776 scrub_parity_check_and_repair(sparity);
2777}
2778
2779static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2780 struct map_lookup *map,
2781 struct btrfs_device *sdev,
2782 struct btrfs_path *path,
2783 u64 logic_start,
2784 u64 logic_end)
2785{
2786 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2787 struct btrfs_root *root = fs_info->extent_root;
2788 struct btrfs_root *csum_root = fs_info->csum_root;
2789 struct btrfs_extent_item *extent;
2790 u64 flags;
2791 int ret;
2792 int slot;
2793 struct extent_buffer *l;
2794 struct btrfs_key key;
2795 u64 generation;
2796 u64 extent_logical;
2797 u64 extent_physical;
2798 u64 extent_len;
2799 struct btrfs_device *extent_dev;
2800 struct scrub_parity *sparity;
2801 int nsectors;
2802 int bitmap_len;
2803 int extent_mirror_num;
2804 int stop_loop = 0;
2805
2806 nsectors = map->stripe_len / root->sectorsize;
2807 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2808 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2809 GFP_NOFS);
2810 if (!sparity) {
2811 spin_lock(&sctx->stat_lock);
2812 sctx->stat.malloc_errors++;
2813 spin_unlock(&sctx->stat_lock);
2814 return -ENOMEM;
2815 }
2816
2817 sparity->stripe_len = map->stripe_len;
2818 sparity->nsectors = nsectors;
2819 sparity->sctx = sctx;
2820 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1);
2824 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2827
2828 ret = 0;
2829 while (logic_start < logic_end) {
2830 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2831 key.type = BTRFS_METADATA_ITEM_KEY;
2832 else
2833 key.type = BTRFS_EXTENT_ITEM_KEY;
2834 key.objectid = logic_start;
2835 key.offset = (u64)-1;
2836
2837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2838 if (ret < 0)
2839 goto out;
2840
2841 if (ret > 0) {
2842 ret = btrfs_previous_extent_item(root, path, 0);
2843 if (ret < 0)
2844 goto out;
2845 if (ret > 0) {
2846 btrfs_release_path(path);
2847 ret = btrfs_search_slot(NULL, root, &key,
2848 path, 0, 0);
2849 if (ret < 0)
2850 goto out;
2851 }
2852 }
2853
2854 stop_loop = 0;
2855 while (1) {
2856 u64 bytes;
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860 if (slot >= btrfs_header_nritems(l)) {
2861 ret = btrfs_next_leaf(root, path);
2862 if (ret == 0)
2863 continue;
2864 if (ret < 0)
2865 goto out;
2866
2867 stop_loop = 1;
2868 break;
2869 }
2870 btrfs_item_key_to_cpu(l, &key, slot);
2871
2872 if (key.type == BTRFS_METADATA_ITEM_KEY)
2873 bytes = root->nodesize;
2874 else
2875 bytes = key.offset;
2876
2877 if (key.objectid + bytes <= logic_start)
2878 goto next;
2879
2880 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2881 key.type != BTRFS_METADATA_ITEM_KEY)
2882 goto next;
2883
2884 if (key.objectid > logic_end) {
2885 stop_loop = 1;
2886 break;
2887 }
2888
2889 while (key.objectid >= logic_start + map->stripe_len)
2890 logic_start += map->stripe_len;
2891
2892 extent = btrfs_item_ptr(l, slot,
2893 struct btrfs_extent_item);
2894 flags = btrfs_extent_flags(l, extent);
2895 generation = btrfs_extent_generation(l, extent);
2896
2897 if (key.objectid < logic_start &&
2898 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2899 btrfs_err(fs_info,
2900 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2901 key.objectid, logic_start);
2902 goto next;
2903 }
2904again:
2905 extent_logical = key.objectid;
2906 extent_len = bytes;
2907
2908 if (extent_logical < logic_start) {
2909 extent_len -= logic_start - extent_logical;
2910 extent_logical = logic_start;
2911 }
2912
2913 if (extent_logical + extent_len >
2914 logic_start + map->stripe_len)
2915 extent_len = logic_start + map->stripe_len -
2916 extent_logical;
2917
2918 scrub_parity_mark_sectors_data(sparity, extent_logical,
2919 extent_len);
2920
2921 scrub_remap_extent(fs_info, extent_logical,
2922 extent_len, &extent_physical,
2923 &extent_dev,
2924 &extent_mirror_num);
2925
2926 ret = btrfs_lookup_csums_range(csum_root,
2927 extent_logical,
2928 extent_logical + extent_len - 1,
2929 &sctx->csum_list, 1);
2930 if (ret)
2931 goto out;
2932
2933 ret = scrub_extent_for_parity(sparity, extent_logical,
2934 extent_len,
2935 extent_physical,
2936 extent_dev, flags,
2937 generation,
2938 extent_mirror_num);
2939 if (ret)
2940 goto out;
2941
2942 scrub_free_csums(sctx);
2943 if (extent_logical + extent_len <
2944 key.objectid + bytes) {
2945 logic_start += map->stripe_len;
2946
2947 if (logic_start >= logic_end) {
2948 stop_loop = 1;
2949 break;
2950 }
2951
2952 if (logic_start < key.objectid + bytes) {
2953 cond_resched();
2954 goto again;
2955 }
2956 }
2957next:
2958 path->slots[0]++;
2959 }
2960
2961 btrfs_release_path(path);
2962
2963 if (stop_loop)
2964 break;
2965
2966 logic_start += map->stripe_len;
2967 }
2968out:
2969 if (ret < 0)
2970 scrub_parity_mark_sectors_error(sparity, logic_start,
2971 logic_end - logic_start + 1);
2972 scrub_parity_put(sparity);
2973 scrub_submit(sctx);
2974 mutex_lock(&sctx->wr_ctx.wr_lock);
2975 scrub_wr_submit(sctx);
2976 mutex_unlock(&sctx->wr_ctx.wr_lock);
2977
2978 btrfs_release_path(path);
2979 return ret < 0 ? ret : 0;
2980}
2981
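Inside scrub_raid56_parity() above, each extent found in the tree is clipped to the stripe currently being scrubbed before its sectors are marked in dbitmap: the front is trimmed if the extent starts before logic_start and the tail if it runs past logic_start + stripe_len. The clipping arithmetic in isolation, as a userspace sketch with example values:

#include <stdint.h>
#include <stdio.h>

/* clip [extent_logical, extent_logical + extent_len) to the stripe
 * window [logic_start, logic_start + stripe_len) */
static void clip_to_stripe(uint64_t logic_start, uint64_t stripe_len,
                           uint64_t *extent_logical, uint64_t *extent_len)
{
    if (*extent_logical < logic_start) {
        *extent_len -= logic_start - *extent_logical;
        *extent_logical = logic_start;
    }

    if (*extent_logical + *extent_len > logic_start + stripe_len)
        *extent_len = logic_start + stripe_len - *extent_logical;
}

int main(void)
{
    /* extent 48K..176K against a 64K stripe window starting at 64K */
    uint64_t logical = 49152, len = 131072;

    clip_to_stripe(65536, 65536, &logical, &len);
    printf("clipped to logical=%llu len=%llu\n",
           (unsigned long long)logical,
           (unsigned long long)len);     /* 65536 and 65536 */
    return 0;
}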
2272static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2982static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2273 struct map_lookup *map, 2983 struct map_lookup *map,
2274 struct btrfs_device *scrub_dev, 2984 struct btrfs_device *scrub_dev,
2275 int num, u64 base, u64 length, 2985 int num, u64 base, u64 length,
2276 int is_dev_replace) 2986 int is_dev_replace)
2277{ 2987{
2278 struct btrfs_path *path; 2988 struct btrfs_path *path, *ppath;
2279 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2989 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2280 struct btrfs_root *root = fs_info->extent_root; 2990 struct btrfs_root *root = fs_info->extent_root;
2281 struct btrfs_root *csum_root = fs_info->csum_root; 2991 struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2302 u64 extent_logical; 3012 u64 extent_logical;
2303 u64 extent_physical; 3013 u64 extent_physical;
2304 u64 extent_len; 3014 u64 extent_len;
3015 u64 stripe_logical;
3016 u64 stripe_end;
2305 struct btrfs_device *extent_dev; 3017 struct btrfs_device *extent_dev;
2306 int extent_mirror_num; 3018 int extent_mirror_num;
2307 int stop_loop = 0; 3019 int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2327 mirror_num = num % map->num_stripes + 1; 3039 mirror_num = num % map->num_stripes + 1;
2328 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2329 BTRFS_BLOCK_GROUP_RAID6)) { 3041 BTRFS_BLOCK_GROUP_RAID6)) {
2330 get_raid56_logic_offset(physical, num, map, &offset); 3042 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2331 increment = map->stripe_len * nr_data_stripes(map); 3043 increment = map->stripe_len * nr_data_stripes(map);
2332 mirror_num = 1; 3044 mirror_num = 1;
2333 } else { 3045 } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2339 if (!path) 3051 if (!path)
2340 return -ENOMEM; 3052 return -ENOMEM;
2341 3053
3054 ppath = btrfs_alloc_path();
3055 if (!ppath) {
3056 btrfs_free_path(ppath);
3057 return -ENOMEM;
3058 }
3059
2342 /* 3060 /*
2343 * work on commit root. The related disk blocks are static as 3061 * work on commit root. The related disk blocks are static as
2344 * long as COW is applied. This means, it is save to rewrite 3062 * long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2358 BTRFS_BLOCK_GROUP_RAID6)) { 3076 BTRFS_BLOCK_GROUP_RAID6)) {
2359 get_raid56_logic_offset(physical_end, num, 3077 get_raid56_logic_offset(physical_end, num,
2360 map, &logic_end); 3078 map, &logic_end, NULL);
2361 logic_end += base; 3079 logic_end += base;
2362 } else { 3080 } else {
2363 logic_end = logical + increment * nstripes; 3081 logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2404 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2405 BTRFS_BLOCK_GROUP_RAID6)) { 3123 BTRFS_BLOCK_GROUP_RAID6)) {
2406 ret = get_raid56_logic_offset(physical, num, 3124 ret = get_raid56_logic_offset(physical, num,
2407 map, &logical); 3125 map, &logical, &stripe_logical);
2408 logical += base; 3126 logical += base;
2409 if (ret) 3127 if (ret) {
3128 stripe_logical += base;
3129 stripe_end = stripe_logical + increment - 1;
3130 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3131 ppath, stripe_logical,
3132 stripe_end);
3133 if (ret)
3134 goto out;
2410 goto skip; 3135 goto skip;
3136 }
2411 } 3137 }
2412 /* 3138 /*
2413 * canceled? 3139 * canceled?
@@ -2558,13 +3284,25 @@ again:
2558 * loop until we find next data stripe 3284 * loop until we find next data stripe
2559 * or we have finished all stripes. 3285 * or we have finished all stripes.
2560 */ 3286 */
2561 do { 3287loop:
2562 physical += map->stripe_len; 3288 physical += map->stripe_len;
2563 ret = get_raid56_logic_offset( 3289 ret = get_raid56_logic_offset(physical,
2564 physical, num, 3290 num, map, &logical,
2565 map, &logical); 3291 &stripe_logical);
2566 logical += base; 3292 logical += base;
2567 } while (physical < physical_end && ret); 3293
3294 if (ret && physical < physical_end) {
3295 stripe_logical += base;
3296 stripe_end = stripe_logical +
3297 increment - 1;
3298 ret = scrub_raid56_parity(sctx,
3299 map, scrub_dev, ppath,
3300 stripe_logical,
3301 stripe_end);
3302 if (ret)
3303 goto out;
3304 goto loop;
3305 }
2568 } else { 3306 } else {
2569 physical += map->stripe_len; 3307 physical += map->stripe_len;
2570 logical += increment; 3308 logical += increment;
@@ -2605,6 +3343,7 @@ out:
2605 3343
2606 blk_finish_plug(&plug); 3344 blk_finish_plug(&plug);
2607 btrfs_free_path(path); 3345 btrfs_free_path(path);
3346 btrfs_free_path(ppath);
2608 return ret < 0 ? ret : 0; 3347 return ret < 0 ? ret : 0;
2609} 3348}
2610 3349
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ff2b35114972..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4879,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4880{ 4880{
4881 struct btrfs_bio_stripe s; 4881 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4882 int i; 4883 int i;
4883 u64 l; 4884 u64 l;
4884 int again = 1; 4885 int again = 1;
4886 int m;
4885 4887
4886 while (again) { 4888 while (again) {
4887 again = 0; 4889 again = 0;
4888 for (i = 0; i < bbio->num_stripes - 1; i++) { 4890 for (i = 0; i < real_stripes - 1; i++) {
4889 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4891 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4890 s = bbio->stripes[i]; 4892 s = bbio->stripes[i];
4891 l = raid_map[i]; 4893 l = raid_map[i];
@@ -4893,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4893 raid_map[i] = raid_map[i+1]; 4895 raid_map[i] = raid_map[i+1];
4894 bbio->stripes[i+1] = s; 4896 bbio->stripes[i+1] = s;
4895 raid_map[i+1] = l; 4897 raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905
4896 again = 1; 4906 again = 1;
4897 } 4907 }
4898 } 4908 }
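sort_parity_stripes() now has to keep three arrays consistent while it bubbles the parity stripes to the end: the stripe array, the raid_map, and the new tgtdev_map that records which target-device stripe mirrors each source stripe. The swap-in-lockstep idea in a userspace sketch; the ordering rule here is a plain ascending sort, the map is swapped unconditionally, and all values are illustrative:

#include <stdint.h>
#include <stdio.h>

#define NSTRIPES 4

int main(void)
{
    /* parallel arrays: the sort key, plus two payloads that must stay
     * attached to their key (like stripes[] and tgtdev_map[]) */
    uint64_t raid_map[NSTRIPES] = { 196608, 65536, (uint64_t)-2, 131072 };
    char     stripe[NSTRIPES]   = { 'c', 'a', 'P', 'b' };
    int      tgtdev[NSTRIPES]   = { 12, 10, 13, 11 };
    int i, again = 1;

    while (again) {
        again = 0;
        for (i = 0; i < NSTRIPES - 1; i++) {
            if (raid_map[i] > raid_map[i + 1]) {
                uint64_t l = raid_map[i];
                char s = stripe[i];
                int m = tgtdev[i];

                /* swap all three entries together */
                raid_map[i] = raid_map[i + 1];
                stripe[i] = stripe[i + 1];
                tgtdev[i] = tgtdev[i + 1];
                raid_map[i + 1] = l;
                stripe[i + 1] = s;
                tgtdev[i + 1] = m;
                again = 1;
            }
        }
    }

    for (i = 0; i < NSTRIPES; i++)
        printf("%c -> tgtdev %d (raid_map %llu)\n", stripe[i], tgtdev[i],
               (unsigned long long)raid_map[i]);
    return 0;
}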
@@ -4921,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4921 int ret = 0; 4931 int ret = 0;
4922 int num_stripes; 4932 int num_stripes;
4923 int max_errors = 0; 4933 int max_errors = 0;
4934 int tgtdev_indexes = 0;
4924 struct btrfs_bio *bbio = NULL; 4935 struct btrfs_bio *bbio = NULL;
4925 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4936 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4926 int dev_replace_is_ongoing = 0; 4937 int dev_replace_is_ongoing = 0;
@@ -5159,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5159 BTRFS_BLOCK_GROUP_RAID6)) { 5170 BTRFS_BLOCK_GROUP_RAID6)) {
5160 u64 tmp; 5171 u64 tmp;
5161 5172
5162 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 5173 if (raid_map_ret &&
5163 && raid_map_ret) { 5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) {
5164 int i, rot; 5176 int i, rot;
5165 5177
5166 /* push stripe_nr back to the start of the full stripe */ 5178 /* push stripe_nr back to the start of the full stripe */
5167 stripe_nr = raid56_full_stripe_start; 5179 stripe_nr = raid56_full_stripe_start;
5168 do_div(stripe_nr, stripe_len); 5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5169
5170 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5171 5181
5172 /* RAID[56] write or recovery. Return all stripes */ 5182 /* RAID[56] write or recovery. Return all stripes */
5173 num_stripes = map->num_stripes; 5183 num_stripes = map->num_stripes;
@@ -5233,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5233 num_alloc_stripes <<= 1; 5243 num_alloc_stripes <<= 1;
5234 if (rw & REQ_GET_READ_MIRRORS) 5244 if (rw & REQ_GET_READ_MIRRORS)
5235 num_alloc_stripes++; 5245 num_alloc_stripes++;
5246 tgtdev_indexes = num_stripes;
5236 } 5247 }
5237 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 5248
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
5250 GFP_NOFS);
5238 if (!bbio) { 5251 if (!bbio) {
5239 kfree(raid_map); 5252 kfree(raid_map);
5240 ret = -ENOMEM; 5253 ret = -ENOMEM;
5241 goto out; 5254 goto out;
5242 } 5255 }
5243 atomic_set(&bbio->error, 0); 5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5244 5259
5245 if (rw & REQ_DISCARD) { 5260 if (rw & REQ_DISCARD) {
5246 int factor = 0; 5261 int factor = 0;
@@ -5325,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5325 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5326 max_errors = btrfs_chunk_max_errors(map); 5341 max_errors = btrfs_chunk_max_errors(map);
5327 5342
5343 tgtdev_indexes = 0;
5328 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5329 dev_replace->tgtdev != NULL) { 5345 dev_replace->tgtdev != NULL) {
5330 int index_where_to_add; 5346 int index_where_to_add;
@@ -5353,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5353 new->physical = old->physical; 5369 new->physical = old->physical;
5354 new->length = old->length; 5370 new->length = old->length;
5355 new->dev = dev_replace->tgtdev; 5371 new->dev = dev_replace->tgtdev;
5372 bbio->tgtdev_map[i] = index_where_to_add;
5356 index_where_to_add++; 5373 index_where_to_add++;
5357 max_errors++; 5374 max_errors++;
5375 tgtdev_indexes++;
5358 } 5376 }
5359 } 5377 }
5360 num_stripes = index_where_to_add; 5378 num_stripes = index_where_to_add;
@@ -5400,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5400 tgtdev_stripe->length = 5418 tgtdev_stripe->length =
5401 bbio->stripes[index_srcdev].length; 5419 bbio->stripes[index_srcdev].length;
5402 tgtdev_stripe->dev = dev_replace->tgtdev; 5420 tgtdev_stripe->dev = dev_replace->tgtdev;
5421 bbio->tgtdev_map[index_srcdev] = num_stripes;
5403 5422
5423 tgtdev_indexes++;
5404 num_stripes++; 5424 num_stripes++;
5405 } 5425 }
5406 } 5426 }
@@ -5410,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5410 bbio->num_stripes = num_stripes; 5430 bbio->num_stripes = num_stripes;
5411 bbio->max_errors = max_errors; 5431 bbio->max_errors = max_errors;
5412 bbio->mirror_num = mirror_num; 5432 bbio->mirror_num = mirror_num;
5433 bbio->num_tgtdevs = tgtdev_indexes;
5413 5434
5414 /* 5435 /*
5415 * this is the case that REQ_READ && dev_replace_is_ongoing && 5436 * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5441,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5441 mirror_num, NULL); 5462 mirror_num, NULL);
5442} 5463}
5443 5464
5465/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret)
5470{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret);
5473}
5474
5444int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5445 u64 chunk_start, u64 physical, u64 devid, 5476 u64 chunk_start, u64 physical, u64 devid,
5446 u64 **logical, int *naddrs, int *stripe_len) 5477 u64 **logical, int *naddrs, int *stripe_len)
@@ -5810,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5810 } else { 5841 } else {
5811 ret = raid56_parity_recover(root, bio, bbio, 5842 ret = raid56_parity_recover(root, bio, bbio,
5812 raid_map, map_length, 5843 raid_map, map_length,
5813 mirror_num); 5844 mirror_num, 1);
5814 } 5845 }
5815 /* 5846
5816 * FIXME, replace dosen't support raid56 yet, please fix
5817 * it in the future.
5818 */
5819 btrfs_bio_counter_dec(root->fs_info); 5847 btrfs_bio_counter_dec(root->fs_info);
5820 return ret; 5848 return ret;
5821 } 5849 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 637bcfadadb2..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
292struct btrfs_bio; 292struct btrfs_bio;
293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); 293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
294 294
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t stripes_pending; 298 atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
305 int max_errors; 305 int max_errors;
306 int num_stripes; 306 int num_stripes;
307 int mirror_num; 307 int mirror_num;
308 int num_tgtdevs;
309 int *tgtdev_map;
308 struct btrfs_bio_stripe stripes[]; 310 struct btrfs_bio_stripe stripes[];
309}; 311};
310 312
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
387int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
388 u64 end, u64 *length); 390 u64 end, u64 *length);
389 391
390#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ 392#define btrfs_bio_size(total_stripes, real_stripes) \
391 (sizeof(struct btrfs_bio_stripe) * (n))) 393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
392 396
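btrfs_bio_size() now reserves room for one int per real stripe behind the stripe array, and __btrfs_map_block() points tgtdev_map at that tail with (int *)(bbio->stripes + num_alloc_stripes). A userspace sketch of that single-allocation layout; the struct and field names are stand-ins, not the btrfs definitions:

#include <stdio.h>
#include <stdlib.h>

struct stripe {
    unsigned long long physical;
};

struct bio_map {
    int num_stripes;
    int *tgtdev_map;            /* points into the same allocation */
    struct stripe stripes[];    /* flexible array member */
};

#define BIO_MAP_SIZE(total, real) \
    (sizeof(struct bio_map) + \
     sizeof(struct stripe) * (total) + \
     sizeof(int) * (real))

int main(void)
{
    int total = 3, real = 3, i;
    struct bio_map *bm = calloc(1, BIO_MAP_SIZE(total, real));

    if (!bm)
        return 1;

    bm->num_stripes = total;
    /* the int map lives right after the last stripe slot */
    bm->tgtdev_map = (int *)(bm->stripes + total);

    for (i = 0; i < real; i++)
        bm->tgtdev_map[i] = i;    /* identity mapping, just for show */

    printf("allocation is %zu bytes for %d stripes + %d map entries\n",
           BIO_MAP_SIZE(total, real), total, real);
    free(bm);
    return 0;
}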
393int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
394 u64 logical, u64 *length, 398 u64 logical, u64 *length,
395 struct btrfs_bio **bbio_ret, int mirror_num); 399 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret);
396int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
397 u64 chunk_start, u64 physical, u64 devid, 405 u64 chunk_start, u64 physical, u64 devid,
398 u64 **logical, int *naddrs, int *stripe_len); 406 u64 **logical, int *naddrs, int *stripe_len);