Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r-- | fs/btrfs/raid56.c | 763 |
1 file changed, 687 insertions(+), 76 deletions(-)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@ | |||
58 | */ | 58 | */ |
59 | #define RBIO_CACHE_READY_BIT 3 | 59 | #define RBIO_CACHE_READY_BIT 3 |
60 | 60 | ||
61 | /* | ||
62 | * bbio and raid_map are managed by the caller, so we shouldn't free | ||
63 | * them here. Besides that, rbios with this flag set must not be | ||
64 | * cached, because we need raid_map to check whether two rbios cover | ||
65 | * the same stripe, but the caller has very likely already freed | ||
66 | * raid_map, so don't cache those rbios. | ||
67 | */ | ||
68 | #define RBIO_HOLD_BBIO_MAP_BIT 4 | ||
61 | 69 | ||
62 | #define RBIO_CACHE_SIZE 1024 | 70 | #define RBIO_CACHE_SIZE 1024 |
63 | 71 | ||
72 | enum btrfs_rbio_ops { | ||
73 | BTRFS_RBIO_WRITE = 0, | ||
74 | BTRFS_RBIO_READ_REBUILD = 1, | ||
75 | BTRFS_RBIO_PARITY_SCRUB = 2, | ||
76 | }; | ||
77 | |||
64 | struct btrfs_raid_bio { | 78 | struct btrfs_raid_bio { |
65 | struct btrfs_fs_info *fs_info; | 79 | struct btrfs_fs_info *fs_info; |
66 | struct btrfs_bio *bbio; | 80 | struct btrfs_bio *bbio; |
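The new RBIO_HOLD_BBIO_MAP_BIT encodes a conditional-ownership contract: when the rbio was built around a caller-owned bbio/raid_map pair, the rbio must neither free them on teardown nor cache itself. A minimal userspace sketch of that pattern (all names here are illustrative, not the kernel API); the same test drives free_bbio_and_raid_map() further down in the patch:

#include <stdlib.h>

#define HOLD_EXTERNAL_BIT (1UL << 4)     /* caller keeps ownership */

struct rbio_like {
	unsigned long flags;
	void *bbio;                      /* caller-provided control block */
	unsigned long *raid_map;         /* caller-provided stripe map */
};

/* Free bbio/raid_map only when this object owns them. */
static void rbio_like_free(struct rbio_like *r)
{
	if (!(r->flags & HOLD_EXTERNAL_BIT)) {
		free(r->raid_map);
		free(r->bbio);
	}
	free(r);
}

int main(void)
{
	struct rbio_like *r = calloc(1, sizeof(*r));

	r->bbio = malloc(16);
	r->raid_map = malloc(4 * sizeof(unsigned long));
	r->flags |= HOLD_EXTERNAL_BIT;   /* pretend the caller owns them */

	void *bbio = r->bbio;
	unsigned long *map = r->raid_map;

	rbio_like_free(r);               /* must not touch bbio/raid_map */
	free(map);                       /* the caller frees its own objects */
	free(bbio);
	return 0;
}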
@@ -117,13 +131,16 @@ struct btrfs_raid_bio { | |||
117 | /* number of data stripes (no p/q) */ | 131 | /* number of data stripes (no p/q) */ |
118 | int nr_data; | 132 | int nr_data; |
119 | 133 | ||
134 | int real_stripes; | ||
135 | |||
136 | int stripe_npages; | ||
120 | /* | 137 | /* |
121 | * set if we're doing a parity rebuild | 138 | * set if we're doing a parity rebuild |
122 | * for a read from higher up, which is handled | 139 | * for a read from higher up, which is handled |
123 | * differently from a parity rebuild as part of | 140 | * differently from a parity rebuild as part of |
124 | * rmw | 141 | * rmw |
125 | */ | 142 | */ |
126 | int read_rebuild; | 143 | enum btrfs_rbio_ops operation; |
127 | 144 | ||
128 | /* first bad stripe */ | 145 | /* first bad stripe */ |
129 | int faila; | 146 | int faila; |
@@ -131,6 +148,7 @@ struct btrfs_raid_bio { | |||
131 | /* second bad stripe (for raid6 use) */ | 148 | /* second bad stripe (for raid6 use) */ |
132 | int failb; | 149 | int failb; |
133 | 150 | ||
151 | int scrubp; | ||
134 | /* | 152 | /* |
135 | * number of pages needed to represent the full | 153 | * number of pages needed to represent the full |
136 | * stripe | 154 | * stripe |
@@ -144,8 +162,13 @@ struct btrfs_raid_bio { | |||
144 | */ | 162 | */ |
145 | int bio_list_bytes; | 163 | int bio_list_bytes; |
146 | 164 | ||
165 | int generic_bio_cnt; | ||
166 | |||
147 | atomic_t refs; | 167 | atomic_t refs; |
148 | 168 | ||
169 | atomic_t stripes_pending; | ||
170 | |||
171 | atomic_t error; | ||
149 | /* | 172 | /* |
150 | * these are two arrays of pointers. We allocate the | 173 | * these are two arrays of pointers. We allocate the |
151 | * rbio big enough to hold them both and setup their | 174 | * rbio big enough to hold them both and setup their |
@@ -162,6 +185,11 @@ struct btrfs_raid_bio { | |||
162 | * here for faster lookup | 185 | * here for faster lookup |
163 | */ | 186 | */ |
164 | struct page **bio_pages; | 187 | struct page **bio_pages; |
188 | |||
189 | /* | ||
190 | * bitmap to record which horizontal stripe has data | ||
191 | */ | ||
192 | unsigned long *dbitmap; | ||
165 | }; | 193 | }; |
166 | 194 | ||
167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | 195 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); |
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio); | |||
176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | 204 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); |
177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | 205 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); |
178 | 206 | ||
207 | static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, | ||
208 | int need_check); | ||
209 | static void async_scrub_parity(struct btrfs_raid_bio *rbio); | ||
210 | |||
179 | /* | 211 | /* |
180 | * the stripe hash table is used for locking, and to collect | 212 | * the stripe hash table is used for locking, and to collect |
181 | * bios in hopes of making a full stripe | 213 | * bios in hopes of making a full stripe |
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest, | |||
324 | { | 356 | { |
325 | bio_list_merge(&dest->bio_list, &victim->bio_list); | 357 | bio_list_merge(&dest->bio_list, &victim->bio_list); |
326 | dest->bio_list_bytes += victim->bio_list_bytes; | 358 | dest->bio_list_bytes += victim->bio_list_bytes; |
359 | dest->generic_bio_cnt += victim->generic_bio_cnt; | ||
327 | bio_list_init(&victim->bio_list); | 360 | bio_list_init(&victim->bio_list); |
328 | } | 361 | } |
329 | 362 | ||
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, | |||
577 | cur->raid_map[0]) | 610 | cur->raid_map[0]) |
578 | return 0; | 611 | return 0; |
579 | 612 | ||
580 | /* reads can't merge with writes */ | 613 | /* we can't merge with different operations */ |
581 | if (last->read_rebuild != | 614 | if (last->operation != cur->operation) |
582 | cur->read_rebuild) { | 615 | return 0; |
616 | /* | ||
617 | * A parity scrub reads the full stripe from the drive, checks | ||
618 | * and repairs the parity, and writes the new results back. | ||
619 | * | ||
620 | * We're not allowed to add any new bios to the | ||
621 | * bio list here, anyone else that wants to | ||
622 | * change this stripe needs to do their own rmw. | ||
623 | */ | ||
624 | if (last->operation == BTRFS_RBIO_PARITY_SCRUB || | ||
625 | cur->operation == BTRFS_RBIO_PARITY_SCRUB) | ||
583 | return 0; | 626 | return 0; |
584 | } | ||
585 | 627 | ||
586 | return 1; | 628 | return 1; |
587 | } | 629 | } |
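With the enum in place, the merge rules reduce to two checks: both rbios must be running the same operation, and parity scrubs never merge at all, because a scrub rbio's bio list is effectively frozen once it has read the full stripe. A hedged distillation of the predicate (the same-stripe check is done by the caller, as in the patch):

enum rbio_ops { RBIO_WRITE, RBIO_READ_REBUILD, RBIO_PARITY_SCRUB };

/* Merge policy only; the caller has already matched raid_map[0]. */
static int ops_can_merge(enum rbio_ops last, enum rbio_ops cur)
{
	if (last != cur)
		return 0;        /* different operations never merge */
	if (last == RBIO_PARITY_SCRUB)
		return 0;        /* a scrub rbio takes no new bios */
	return 1;
}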
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | |||
601 | */ | 643 | */ |
602 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | 644 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) |
603 | { | 645 | { |
604 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | 646 | if (rbio->nr_data + 1 == rbio->real_stripes) |
605 | return NULL; | 647 | return NULL; |
606 | 648 | ||
607 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | 649 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> |
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | |||
772 | spin_unlock(&rbio->bio_list_lock); | 814 | spin_unlock(&rbio->bio_list_lock); |
773 | spin_unlock_irqrestore(&h->lock, flags); | 815 | spin_unlock_irqrestore(&h->lock, flags); |
774 | 816 | ||
775 | if (next->read_rebuild) | 817 | if (next->operation == BTRFS_RBIO_READ_REBUILD) |
776 | async_read_rebuild(next); | 818 | async_read_rebuild(next); |
777 | else { | 819 | else if (next->operation == BTRFS_RBIO_WRITE) { |
778 | steal_rbio(rbio, next); | 820 | steal_rbio(rbio, next); |
779 | async_rmw_stripe(next); | 821 | async_rmw_stripe(next); |
822 | } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { | ||
823 | steal_rbio(rbio, next); | ||
824 | async_scrub_parity(next); | ||
780 | } | 825 | } |
781 | 826 | ||
782 | goto done_nolock; | 827 | goto done_nolock; |
@@ -796,6 +841,21 @@ done_nolock: | |||
796 | remove_rbio_from_cache(rbio); | 841 | remove_rbio_from_cache(rbio); |
797 | } | 842 | } |
798 | 843 | ||
844 | static inline void | ||
845 | __free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) | ||
846 | { | ||
847 | if (need) { | ||
848 | kfree(raid_map); | ||
849 | kfree(bbio); | ||
850 | } | ||
851 | } | ||
852 | |||
853 | static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) | ||
854 | { | ||
855 | __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, | ||
856 | !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); | ||
857 | } | ||
858 | |||
799 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | 859 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) |
800 | { | 860 | { |
801 | int i; | 861 | int i; |
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) | |||
814 | rbio->stripe_pages[i] = NULL; | 874 | rbio->stripe_pages[i] = NULL; |
815 | } | 875 | } |
816 | } | 876 | } |
817 | kfree(rbio->raid_map); | 877 | |
818 | kfree(rbio->bbio); | 878 | free_bbio_and_raid_map(rbio); |
879 | |||
819 | kfree(rbio); | 880 | kfree(rbio); |
820 | } | 881 | } |
821 | 882 | ||
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | |||
833 | { | 894 | { |
834 | struct bio *cur = bio_list_get(&rbio->bio_list); | 895 | struct bio *cur = bio_list_get(&rbio->bio_list); |
835 | struct bio *next; | 896 | struct bio *next; |
897 | |||
898 | if (rbio->generic_bio_cnt) | ||
899 | btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); | ||
900 | |||
836 | free_raid_bio(rbio); | 901 | free_raid_bio(rbio); |
837 | 902 | ||
838 | while (cur) { | 903 | while (cur) { |
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err) | |||
858 | 923 | ||
859 | bio_put(bio); | 924 | bio_put(bio); |
860 | 925 | ||
861 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 926 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
862 | return; | 927 | return; |
863 | 928 | ||
864 | err = 0; | 929 | err = 0; |
865 | 930 | ||
866 | /* OK, we have read all the stripes we need to. */ | 931 | /* OK, we have read all the stripes we need to. */ |
867 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 932 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
868 | err = -EIO; | 933 | err = -EIO; |
869 | 934 | ||
870 | rbio_orig_end_io(rbio, err, 0); | 935 | rbio_orig_end_io(rbio, err, 0); |
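stripes_pending and error used to live in the bbio; moving them into the rbio lets the bbio be freed as soon as the last bio is submitted. The completion discipline itself is unchanged: each finished bio decrements the pending count, only the final decrement runs the end-io logic, and the error count is compared against max_errors (the number of parity stripes). A self-contained userspace sketch of that pattern with C11 atomics, purely illustrative:

#include <stdatomic.h>
#include <stdio.h>

struct pending_io {
	atomic_int pending;      /* outstanding sub-IOs */
	atomic_int errors;       /* failed sub-IOs */
	int max_errors;          /* tolerated failures (parity count) */
};

/* Called once per completed sub-IO, possibly from many threads. */
static void io_done(struct pending_io *p, int failed)
{
	if (failed)
		atomic_fetch_add(&p->errors, 1);
	if (atomic_fetch_sub(&p->pending, 1) != 1)
		return;          /* not the last completion */
	if (atomic_load(&p->errors) > p->max_errors)
		printf("stripe failed: -EIO\n");
	else
		printf("stripe complete\n");
}

int main(void)
{
	struct pending_io p = { .max_errors = 1 };

	atomic_store(&p.errors, 0);
	atomic_store(&p.pending, 3);    /* as after atomic_set(..., bios_to_read) */
	io_done(&p, 0);
	io_done(&p, 1);                 /* one failure is within RAID5 tolerance */
	io_done(&p, 0);                 /* last one fires the final completion */
	return 0;
}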
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
925 | { | 990 | { |
926 | struct btrfs_raid_bio *rbio; | 991 | struct btrfs_raid_bio *rbio; |
927 | int nr_data = 0; | 992 | int nr_data = 0; |
928 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | 993 | int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; |
994 | int num_pages = rbio_nr_pages(stripe_len, real_stripes); | ||
995 | int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); | ||
929 | void *p; | 996 | void *p; |
930 | 997 | ||
931 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | 998 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + |
999 | DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), | ||
932 | GFP_NOFS); | 1000 | GFP_NOFS); |
933 | if (!rbio) { | 1001 | if (!rbio) |
934 | kfree(raid_map); | ||
935 | kfree(bbio); | ||
936 | return ERR_PTR(-ENOMEM); | 1002 | return ERR_PTR(-ENOMEM); |
937 | } | ||
938 | 1003 | ||
939 | bio_list_init(&rbio->bio_list); | 1004 | bio_list_init(&rbio->bio_list); |
940 | INIT_LIST_HEAD(&rbio->plug_list); | 1005 | INIT_LIST_HEAD(&rbio->plug_list); |
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
946 | rbio->fs_info = root->fs_info; | 1011 | rbio->fs_info = root->fs_info; |
947 | rbio->stripe_len = stripe_len; | 1012 | rbio->stripe_len = stripe_len; |
948 | rbio->nr_pages = num_pages; | 1013 | rbio->nr_pages = num_pages; |
1014 | rbio->real_stripes = real_stripes; | ||
1015 | rbio->stripe_npages = stripe_npages; | ||
949 | rbio->faila = -1; | 1016 | rbio->faila = -1; |
950 | rbio->failb = -1; | 1017 | rbio->failb = -1; |
951 | atomic_set(&rbio->refs, 1); | 1018 | atomic_set(&rbio->refs, 1); |
1019 | atomic_set(&rbio->error, 0); | ||
1020 | atomic_set(&rbio->stripes_pending, 0); | ||
952 | 1021 | ||
953 | /* | 1022 | /* |
954 | * the stripe_pages and bio_pages array point to the extra | 1023 | * the stripe_pages and bio_pages array point to the extra |
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
957 | p = rbio + 1; | 1026 | p = rbio + 1; |
958 | rbio->stripe_pages = p; | 1027 | rbio->stripe_pages = p; |
959 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | 1028 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; |
1029 | rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; | ||
960 | 1030 | ||
961 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | 1031 | if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) |
962 | nr_data = bbio->num_stripes - 2; | 1032 | nr_data = real_stripes - 2; |
963 | else | 1033 | else |
964 | nr_data = bbio->num_stripes - 1; | 1034 | nr_data = real_stripes - 1; |
965 | 1035 | ||
966 | rbio->nr_data = nr_data; | 1036 | rbio->nr_data = nr_data; |
967 | return rbio; | 1037 | return rbio; |
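alloc_rbio() now carves three variable-size regions out of a single kzalloc: the stripe_pages array, the bio_pages array, and the dbitmap, which holds one bit per page of a single stripe. Here is a minimal userspace sketch of the same carving, assuming 4 KiB pages and sizing the bitmap with the conventional whole-longs rounding:

#include <stdlib.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct rbio_hdr {
	void **stripe_pages;    /* real_stripes * stripe_npages entries */
	void **bio_pages;       /* same size, pages from the bio list */
	unsigned long *dbitmap; /* one bit per page of one stripe */
};

static struct rbio_hdr *alloc_layout(int real_stripes, unsigned long stripe_len)
{
	unsigned long stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	unsigned long num_pages = real_stripes * stripe_npages;
	/* bitmap bytes, rounded up to whole longs */
	unsigned long bitmap_bytes =
		DIV_ROUND_UP(stripe_npages, 8 * sizeof(long)) * sizeof(long);
	struct rbio_hdr *r;
	char *p;

	r = calloc(1, sizeof(*r) + 2 * num_pages * sizeof(void *) + bitmap_bytes);
	if (!r)
		return NULL;
	p = (char *)(r + 1);
	r->stripe_pages = (void **)p;
	r->bio_pages = (void **)(p + num_pages * sizeof(void *));
	r->dbitmap = (unsigned long *)(p + 2 * num_pages * sizeof(void *));
	return r;
}

int main(void)
{
	/* 3 devices (RAID5: 2 data + parity), 64 KiB stripes -> 16 pages each */
	struct rbio_hdr *r = alloc_layout(3, 64 * 1024);

	if (!r)
		return 1;
	printf("bitmap starts %zu bytes after bio_pages\n",
	       (size_t)((char *)r->dbitmap - (char *)r->bio_pages));
	free(r);
	return 0;
}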
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, | |||
1073 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | 1143 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) |
1074 | { | 1144 | { |
1075 | if (rbio->faila >= 0 || rbio->failb >= 0) { | 1145 | if (rbio->faila >= 0 || rbio->failb >= 0) { |
1076 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | 1146 | BUG_ON(rbio->faila == rbio->real_stripes - 1); |
1077 | __raid56_parity_recover(rbio); | 1147 | __raid56_parity_recover(rbio); |
1078 | } else { | 1148 | } else { |
1079 | finish_rmw(rbio); | 1149 | finish_rmw(rbio); |
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) | |||
1134 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | 1204 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) |
1135 | { | 1205 | { |
1136 | struct btrfs_bio *bbio = rbio->bbio; | 1206 | struct btrfs_bio *bbio = rbio->bbio; |
1137 | void *pointers[bbio->num_stripes]; | 1207 | void *pointers[rbio->real_stripes]; |
1138 | int stripe_len = rbio->stripe_len; | 1208 | int stripe_len = rbio->stripe_len; |
1139 | int nr_data = rbio->nr_data; | 1209 | int nr_data = rbio->nr_data; |
1140 | int stripe; | 1210 | int stripe; |
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1148 | 1218 | ||
1149 | bio_list_init(&bio_list); | 1219 | bio_list_init(&bio_list); |
1150 | 1220 | ||
1151 | if (bbio->num_stripes - rbio->nr_data == 1) { | 1221 | if (rbio->real_stripes - rbio->nr_data == 1) { |
1152 | p_stripe = bbio->num_stripes - 1; | 1222 | p_stripe = rbio->real_stripes - 1; |
1153 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | 1223 | } else if (rbio->real_stripes - rbio->nr_data == 2) { |
1154 | p_stripe = bbio->num_stripes - 2; | 1224 | p_stripe = rbio->real_stripes - 2; |
1155 | q_stripe = bbio->num_stripes - 1; | 1225 | q_stripe = rbio->real_stripes - 1; |
1156 | } else { | 1226 | } else { |
1157 | BUG(); | 1227 | BUG(); |
1158 | } | 1228 | } |
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1169 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | 1239 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); |
1170 | spin_unlock_irq(&rbio->bio_list_lock); | 1240 | spin_unlock_irq(&rbio->bio_list_lock); |
1171 | 1241 | ||
1172 | atomic_set(&rbio->bbio->error, 0); | 1242 | atomic_set(&rbio->error, 0); |
1173 | 1243 | ||
1174 | /* | 1244 | /* |
1175 | * now that we've set rmw_locked, run through the | 1245 | * now that we've set rmw_locked, run through the |
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1209 | SetPageUptodate(p); | 1279 | SetPageUptodate(p); |
1210 | pointers[stripe++] = kmap(p); | 1280 | pointers[stripe++] = kmap(p); |
1211 | 1281 | ||
1212 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | 1282 | raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, |
1213 | pointers); | 1283 | pointers); |
1214 | } else { | 1284 | } else { |
1215 | /* raid5 */ | 1285 | /* raid5 */ |
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1218 | } | 1288 | } |
1219 | 1289 | ||
1220 | 1290 | ||
1221 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | 1291 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) |
1222 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | 1292 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); |
1223 | } | 1293 | } |
1224 | 1294 | ||
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1227 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | 1297 | * higher layers (the bio_list in our rbio) and our p/q. Ignore |
1228 | * everything else. | 1298 | * everything else. |
1229 | */ | 1299 | */ |
1230 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | 1300 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1231 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | 1301 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { |
1232 | struct page *page; | 1302 | struct page *page; |
1233 | if (stripe < rbio->nr_data) { | 1303 | if (stripe < rbio->nr_data) { |
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1245 | } | 1315 | } |
1246 | } | 1316 | } |
1247 | 1317 | ||
1248 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | 1318 | if (likely(!bbio->num_tgtdevs)) |
1249 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | 1319 | goto write_data; |
1320 | |||
1321 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { | ||
1322 | if (!bbio->tgtdev_map[stripe]) | ||
1323 | continue; | ||
1324 | |||
1325 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1326 | struct page *page; | ||
1327 | if (stripe < rbio->nr_data) { | ||
1328 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1329 | if (!page) | ||
1330 | continue; | ||
1331 | } else { | ||
1332 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1333 | } | ||
1334 | |||
1335 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
1336 | rbio->bbio->tgtdev_map[stripe], | ||
1337 | pagenr, rbio->stripe_len); | ||
1338 | if (ret) | ||
1339 | goto cleanup; | ||
1340 | } | ||
1341 | } | ||
1342 | |||
1343 | write_data: | ||
1344 | atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); | ||
1345 | BUG_ON(atomic_read(&rbio->stripes_pending) == 0); | ||
1250 | 1346 | ||
1251 | while (1) { | 1347 | while (1) { |
1252 | bio = bio_list_pop(&bio_list); | 1348 | bio = bio_list_pop(&bio_list); |
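The new block before write_data handles a running device replace: when bbio->num_tgtdevs is nonzero, tgtdev_map[stripe] names the bbio stripe index of the replacement target, and every page queued for a stripe under replace is queued a second time against that target so the new device receives a full copy. A compact sketch of the duplication logic (userspace, illustrative names):

#include <stdio.h>

/* tgtdev_map[i] != 0 means stripe i is being replaced and its writes
 * must be mirrored to the device at index tgtdev_map[i]. */
static void queue_writes(const int *tgtdev_map, int real_stripes, int npages)
{
	for (int stripe = 0; stripe < real_stripes; stripe++) {
		for (int page = 0; page < npages; page++)
			printf("write s%d p%d -> dev %d\n", stripe, page, stripe);
		if (!tgtdev_map[stripe])
			continue;       /* no replace running on this stripe */
		for (int page = 0; page < npages; page++)
			printf("dup   s%d p%d -> dev %d\n",
			       stripe, page, tgtdev_map[stripe]);
	}
}

int main(void)
{
	int tgtdev_map[3] = { 0, 3, 0 };   /* stripe 1 is being replaced */

	queue_writes(tgtdev_map, 3, 2);
	return 0;
}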
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, | |||
1283 | stripe = &rbio->bbio->stripes[i]; | 1379 | stripe = &rbio->bbio->stripes[i]; |
1284 | stripe_start = stripe->physical; | 1380 | stripe_start = stripe->physical; |
1285 | if (physical >= stripe_start && | 1381 | if (physical >= stripe_start && |
1286 | physical < stripe_start + rbio->stripe_len) { | 1382 | physical < stripe_start + rbio->stripe_len && |
1383 | bio->bi_bdev == stripe->dev->bdev) { | ||
1287 | return i; | 1384 | return i; |
1288 | } | 1385 | } |
1289 | } | 1386 | } |
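The added bi_bdev comparison closes a real ambiguity: with replace targets in the bbio, two stripes can cover the same physical range on different devices, so matching on the range alone can return the wrong stripe index. A tiny sketch of the corrected lookup:

#include <stdio.h>

struct stripe_like {
	unsigned long long physical;
	const void *bdev;                 /* device identity */
};

static int find_stripe(const struct stripe_like *s, int n,
		       unsigned long long physical, const void *bdev,
		       unsigned long long stripe_len)
{
	for (int i = 0; i < n; i++) {
		if (physical >= s[i].physical &&
		    physical < s[i].physical + stripe_len &&
		    bdev == s[i].bdev)    /* the range alone is ambiguous */
			return i;
	}
	return -1;
}

int main(void)
{
	int devA, devB;                   /* stand-ins for block devices */
	/* both stripes start at the same physical offset, on different devs */
	struct stripe_like s[2] = {
		{ 4096, &devA },
		{ 4096, &devB },
	};

	printf("matched stripe %d\n", find_stripe(s, 2, 8192, &devB, 65536));
	/* prints 1; without the bdev check it could wrongly return 0 */
	return 0;
}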
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | |||
1331 | if (rbio->faila == -1) { | 1428 | if (rbio->faila == -1) { |
1332 | /* first failure on this rbio */ | 1429 | /* first failure on this rbio */ |
1333 | rbio->faila = failed; | 1430 | rbio->faila = failed; |
1334 | atomic_inc(&rbio->bbio->error); | 1431 | atomic_inc(&rbio->error); |
1335 | } else if (rbio->failb == -1) { | 1432 | } else if (rbio->failb == -1) { |
1336 | /* second failure on this rbio */ | 1433 | /* second failure on this rbio */ |
1337 | rbio->failb = failed; | 1434 | rbio->failb = failed; |
1338 | atomic_inc(&rbio->bbio->error); | 1435 | atomic_inc(&rbio->error); |
1339 | } else { | 1436 | } else { |
1340 | ret = -EIO; | 1437 | ret = -EIO; |
1341 | } | 1438 | } |
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err) | |||
1394 | 1491 | ||
1395 | bio_put(bio); | 1492 | bio_put(bio); |
1396 | 1493 | ||
1397 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 1494 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
1398 | return; | 1495 | return; |
1399 | 1496 | ||
1400 | err = 0; | 1497 | err = 0; |
1401 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 1498 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
1402 | goto cleanup; | 1499 | goto cleanup; |
1403 | 1500 | ||
1404 | /* | 1501 | /* |
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio) | |||
1439 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | 1536 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) |
1440 | { | 1537 | { |
1441 | int bios_to_read = 0; | 1538 | int bios_to_read = 0; |
1442 | struct btrfs_bio *bbio = rbio->bbio; | ||
1443 | struct bio_list bio_list; | 1539 | struct bio_list bio_list; |
1444 | int ret; | 1540 | int ret; |
1445 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); | 1541 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); |
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | |||
1455 | 1551 | ||
1456 | index_rbio_pages(rbio); | 1552 | index_rbio_pages(rbio); |
1457 | 1553 | ||
1458 | atomic_set(&rbio->bbio->error, 0); | 1554 | atomic_set(&rbio->error, 0); |
1459 | /* | 1555 | /* |
1460 | * build a list of bios to read all the missing parts of this | 1556 | * build a list of bios to read all the missing parts of this |
1461 | * stripe | 1557 | * stripe |
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | |||
1503 | * the bbio may be freed once we submit the last bio. Make sure | 1599 | * the bbio may be freed once we submit the last bio. Make sure |
1504 | * not to touch it after that | 1600 | * not to touch it after that |
1505 | */ | 1601 | */ |
1506 | atomic_set(&bbio->stripes_pending, bios_to_read); | 1602 | atomic_set(&rbio->stripes_pending, bios_to_read); |
1507 | while (1) { | 1603 | while (1) { |
1508 | bio = bio_list_pop(&bio_list); | 1604 | bio = bio_list_pop(&bio_list); |
1509 | if (!bio) | 1605 | if (!bio) |
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | |||
1686 | struct btrfs_raid_bio *rbio; | 1782 | struct btrfs_raid_bio *rbio; |
1687 | struct btrfs_plug_cb *plug = NULL; | 1783 | struct btrfs_plug_cb *plug = NULL; |
1688 | struct blk_plug_cb *cb; | 1784 | struct blk_plug_cb *cb; |
1785 | int ret; | ||
1689 | 1786 | ||
1690 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | 1787 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); |
1691 | if (IS_ERR(rbio)) | 1788 | if (IS_ERR(rbio)) { |
1789 | __free_bbio_and_raid_map(bbio, raid_map, 1); | ||
1692 | return PTR_ERR(rbio); | 1790 | return PTR_ERR(rbio); |
1791 | } | ||
1693 | bio_list_add(&rbio->bio_list, bio); | 1792 | bio_list_add(&rbio->bio_list, bio); |
1694 | rbio->bio_list_bytes = bio->bi_iter.bi_size; | 1793 | rbio->bio_list_bytes = bio->bi_iter.bi_size; |
1794 | rbio->operation = BTRFS_RBIO_WRITE; | ||
1795 | |||
1796 | btrfs_bio_counter_inc_noblocked(root->fs_info); | ||
1797 | rbio->generic_bio_cnt = 1; | ||
1695 | 1798 | ||
1696 | /* | 1799 | /* |
1697 | * don't plug on full rbios, just get them out the door | 1800 | * don't plug on full rbios, just get them out the door |
1698 | * as quickly as we can | 1801 | * as quickly as we can |
1699 | */ | 1802 | */ |
1700 | if (rbio_is_full(rbio)) | 1803 | if (rbio_is_full(rbio)) { |
1701 | return full_stripe_write(rbio); | 1804 | ret = full_stripe_write(rbio); |
1805 | if (ret) | ||
1806 | btrfs_bio_counter_dec(root->fs_info); | ||
1807 | return ret; | ||
1808 | } | ||
1702 | 1809 | ||
1703 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | 1810 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, |
1704 | sizeof(*plug)); | 1811 | sizeof(*plug)); |
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | |||
1709 | INIT_LIST_HEAD(&plug->rbio_list); | 1816 | INIT_LIST_HEAD(&plug->rbio_list); |
1710 | } | 1817 | } |
1711 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | 1818 | list_add_tail(&rbio->plug_list, &plug->rbio_list); |
1819 | ret = 0; | ||
1712 | } else { | 1820 | } else { |
1713 | return __raid56_parity_write(rbio); | 1821 | ret = __raid56_parity_write(rbio); |
1822 | if (ret) | ||
1823 | btrfs_bio_counter_dec(root->fs_info); | ||
1714 | } | 1824 | } |
1715 | return 0; | 1825 | return ret; |
1716 | } | 1826 | } |
1717 | 1827 | ||
1718 | /* | 1828 | /* |
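The generic_bio_cnt/btrfs_bio_counter pairing gives the filesystem a count of in-flight generic RAID bios (dev-replace waits for it to drain). The rules visible in the raid56_parity_write() hunks above: increment before submitting, decrement immediately on a submission error, and otherwise let rbio_orig_end_io() subtract whatever the rbio accumulated, including counts inherited from merged victims. A userspace sketch of that discipline, with illustrative names:

#include <stdatomic.h>
#include <stdio.h>

static atomic_long fs_bio_counter;   /* in-flight generic rbios */

struct rbio_like { long generic_bio_cnt; };

static int submit_write(struct rbio_like *r, int simulate_error)
{
	atomic_fetch_add(&fs_bio_counter, 1);
	r->generic_bio_cnt = 1;
	if (simulate_error) {
		/* failed to start the IO: undo the count right away */
		atomic_fetch_sub(&fs_bio_counter, 1);
		return -1;
	}
	return 0;
}

static void rbio_end_io(struct rbio_like *r)
{
	/* success path: drop everything this rbio accumulated */
	atomic_fetch_sub(&fs_bio_counter, r->generic_bio_cnt);
}

int main(void)
{
	struct rbio_like a = {0}, b = {0};

	submit_write(&a, 0);
	submit_write(&b, 0);
	a.generic_bio_cnt += b.generic_bio_cnt;   /* as in merge_rbio() */
	b.generic_bio_cnt = 0;
	rbio_end_io(&a);
	printf("in flight: %ld\n", atomic_load(&fs_bio_counter)); /* 0 */
	return 0;
}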
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1730 | int err; | 1840 | int err; |
1731 | int i; | 1841 | int i; |
1732 | 1842 | ||
1733 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | 1843 | pointers = kzalloc(rbio->real_stripes * sizeof(void *), |
1734 | GFP_NOFS); | 1844 | GFP_NOFS); |
1735 | if (!pointers) { | 1845 | if (!pointers) { |
1736 | err = -ENOMEM; | 1846 | err = -ENOMEM; |
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1740 | faila = rbio->faila; | 1850 | faila = rbio->faila; |
1741 | failb = rbio->failb; | 1851 | failb = rbio->failb; |
1742 | 1852 | ||
1743 | if (rbio->read_rebuild) { | 1853 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
1744 | spin_lock_irq(&rbio->bio_list_lock); | 1854 | spin_lock_irq(&rbio->bio_list_lock); |
1745 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | 1855 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); |
1746 | spin_unlock_irq(&rbio->bio_list_lock); | 1856 | spin_unlock_irq(&rbio->bio_list_lock); |
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1749 | index_rbio_pages(rbio); | 1859 | index_rbio_pages(rbio); |
1750 | 1860 | ||
1751 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | 1861 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { |
1862 | /* | ||
1863 | * When doing a parity scrub, the dbitmap marks the horizontal | ||
1864 | * stripes that contain data; skip the pages that don't. | ||
1865 | */ | ||
1866 | if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && | ||
1867 | !test_bit(pagenr, rbio->dbitmap)) | ||
1868 | continue; | ||
1869 | |||
1752 | /* setup our array of pointers with pages | 1870 | /* setup our array of pointers with pages |
1753 | * from each stripe | 1871 | * from each stripe |
1754 | */ | 1872 | */ |
1755 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | 1873 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1756 | /* | 1874 | /* |
1757 | * if we're rebuilding a read, we have to use | 1875 | * if we're rebuilding a read, we have to use |
1758 | * pages from the bio list | 1876 | * pages from the bio list |
1759 | */ | 1877 | */ |
1760 | if (rbio->read_rebuild && | 1878 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD && |
1761 | (stripe == faila || stripe == failb)) { | 1879 | (stripe == faila || stripe == failb)) { |
1762 | page = page_in_rbio(rbio, stripe, pagenr, 0); | 1880 | page = page_in_rbio(rbio, stripe, pagenr, 0); |
1763 | } else { | 1881 | } else { |
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1767 | } | 1885 | } |
1768 | 1886 | ||
1769 | /* all raid6 handling here */ | 1887 | /* all raid6 handling here */ |
1770 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | 1888 | if (rbio->raid_map[rbio->real_stripes - 1] == |
1771 | RAID6_Q_STRIPE) { | 1889 | RAID6_Q_STRIPE) { |
1772 | 1890 | ||
1773 | /* | 1891 | /* |
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1817 | } | 1935 | } |
1818 | 1936 | ||
1819 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | 1937 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { |
1820 | raid6_datap_recov(rbio->bbio->num_stripes, | 1938 | raid6_datap_recov(rbio->real_stripes, |
1821 | PAGE_SIZE, faila, pointers); | 1939 | PAGE_SIZE, faila, pointers); |
1822 | } else { | 1940 | } else { |
1823 | raid6_2data_recov(rbio->bbio->num_stripes, | 1941 | raid6_2data_recov(rbio->real_stripes, |
1824 | PAGE_SIZE, faila, failb, | 1942 | PAGE_SIZE, faila, failb, |
1825 | pointers); | 1943 | pointers); |
1826 | } | 1944 | } |
@@ -1850,7 +1968,7 @@ pstripe: | |||
1850 | * know they can be trusted. If this was a read reconstruction, | 1968 | * know they can be trusted. If this was a read reconstruction, |
1851 | * other endio functions will fiddle the uptodate bits | 1969 | * other endio functions will fiddle the uptodate bits |
1852 | */ | 1970 | */ |
1853 | if (!rbio->read_rebuild) { | 1971 | if (rbio->operation == BTRFS_RBIO_WRITE) { |
1854 | for (i = 0; i < nr_pages; i++) { | 1972 | for (i = 0; i < nr_pages; i++) { |
1855 | if (faila != -1) { | 1973 | if (faila != -1) { |
1856 | page = rbio_stripe_page(rbio, faila, i); | 1974 | page = rbio_stripe_page(rbio, faila, i); |
@@ -1862,12 +1980,12 @@ pstripe: | |||
1862 | } | 1980 | } |
1863 | } | 1981 | } |
1864 | } | 1982 | } |
1865 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | 1983 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1866 | /* | 1984 | /* |
1867 | * if we're rebuilding a read, we have to use | 1985 | * if we're rebuilding a read, we have to use |
1868 | * pages from the bio list | 1986 | * pages from the bio list |
1869 | */ | 1987 | */ |
1870 | if (rbio->read_rebuild && | 1988 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD && |
1871 | (stripe == faila || stripe == failb)) { | 1989 | (stripe == faila || stripe == failb)) { |
1872 | page = page_in_rbio(rbio, stripe, pagenr, 0); | 1990 | page = page_in_rbio(rbio, stripe, pagenr, 0); |
1873 | } else { | 1991 | } else { |
@@ -1882,9 +2000,9 @@ cleanup: | |||
1882 | kfree(pointers); | 2000 | kfree(pointers); |
1883 | 2001 | ||
1884 | cleanup_io: | 2002 | cleanup_io: |
1885 | 2003 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { | |
1886 | if (rbio->read_rebuild) { | 2004 | if (err == 0 && |
1887 | if (err == 0) | 2005 | !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) |
1888 | cache_rbio_pages(rbio); | 2006 | cache_rbio_pages(rbio); |
1889 | else | 2007 | else |
1890 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | 2008 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); |
@@ -1893,7 +2011,13 @@ cleanup_io: | |||
1893 | } else if (err == 0) { | 2011 | } else if (err == 0) { |
1894 | rbio->faila = -1; | 2012 | rbio->faila = -1; |
1895 | rbio->failb = -1; | 2013 | rbio->failb = -1; |
1896 | finish_rmw(rbio); | 2014 | |
2015 | if (rbio->operation == BTRFS_RBIO_WRITE) | ||
2016 | finish_rmw(rbio); | ||
2017 | else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) | ||
2018 | finish_parity_scrub(rbio, 0); | ||
2019 | else | ||
2020 | BUG(); | ||
1897 | } else { | 2021 | } else { |
1898 | rbio_orig_end_io(rbio, err, 0); | 2022 | rbio_orig_end_io(rbio, err, 0); |
1899 | } | 2023 | } |
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err) | |||
1917 | set_bio_pages_uptodate(bio); | 2041 | set_bio_pages_uptodate(bio); |
1918 | bio_put(bio); | 2042 | bio_put(bio); |
1919 | 2043 | ||
1920 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 2044 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
1921 | return; | 2045 | return; |
1922 | 2046 | ||
1923 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 2047 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
1924 | rbio_orig_end_io(rbio, -EIO, 0); | 2048 | rbio_orig_end_io(rbio, -EIO, 0); |
1925 | else | 2049 | else |
1926 | __raid_recover_end_io(rbio); | 2050 | __raid_recover_end_io(rbio); |
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err) | |||
1937 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | 2061 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) |
1938 | { | 2062 | { |
1939 | int bios_to_read = 0; | 2063 | int bios_to_read = 0; |
1940 | struct btrfs_bio *bbio = rbio->bbio; | ||
1941 | struct bio_list bio_list; | 2064 | struct bio_list bio_list; |
1942 | int ret; | 2065 | int ret; |
1943 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); | 2066 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); |
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
1951 | if (ret) | 2074 | if (ret) |
1952 | goto cleanup; | 2075 | goto cleanup; |
1953 | 2076 | ||
1954 | atomic_set(&rbio->bbio->error, 0); | 2077 | atomic_set(&rbio->error, 0); |
1955 | 2078 | ||
1956 | /* | 2079 | /* |
1957 | * read everything that hasn't failed. Thanks to the | 2080 | * read everything that hasn't failed. Thanks to the |
1958 | * stripe cache, it is possible that some or all of these | 2081 | * stripe cache, it is possible that some or all of these |
1959 | * pages are going to be uptodate. | 2082 | * pages are going to be uptodate. |
1960 | */ | 2083 | */ |
1961 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | 2084 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1962 | if (rbio->faila == stripe || rbio->failb == stripe) { | 2085 | if (rbio->faila == stripe || rbio->failb == stripe) { |
1963 | atomic_inc(&rbio->bbio->error); | 2086 | atomic_inc(&rbio->error); |
1964 | continue; | 2087 | continue; |
1965 | } | 2088 | } |
1966 | 2089 | ||
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
1990 | * were up to date, or we might have no bios to read because | 2113 | * were up to date, or we might have no bios to read because |
1991 | * the devices were gone. | 2114 | * the devices were gone. |
1992 | */ | 2115 | */ |
1993 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | 2116 | if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { |
1994 | __raid_recover_end_io(rbio); | 2117 | __raid_recover_end_io(rbio); |
1995 | goto out; | 2118 | goto out; |
1996 | } else { | 2119 | } else { |
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
2002 | * the bbio may be freed once we submit the last bio. Make sure | 2125 | * the bbio may be freed once we submit the last bio. Make sure |
2003 | * not to touch it after that | 2126 | * not to touch it after that |
2004 | */ | 2127 | */ |
2005 | atomic_set(&bbio->stripes_pending, bios_to_read); | 2128 | atomic_set(&rbio->stripes_pending, bios_to_read); |
2006 | while (1) { | 2129 | while (1) { |
2007 | bio = bio_list_pop(&bio_list); | 2130 | bio = bio_list_pop(&bio_list); |
2008 | if (!bio) | 2131 | if (!bio) |
@@ -2021,7 +2144,7 @@ out: | |||
2021 | return 0; | 2144 | return 0; |
2022 | 2145 | ||
2023 | cleanup: | 2146 | cleanup: |
2024 | if (rbio->read_rebuild) | 2147 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) |
2025 | rbio_orig_end_io(rbio, -EIO, 0); | 2148 | rbio_orig_end_io(rbio, -EIO, 0); |
2026 | return -EIO; | 2149 | return -EIO; |
2027 | } | 2150 | } |
@@ -2034,34 +2157,42 @@ cleanup: | |||
2034 | */ | 2157 | */ |
2035 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | 2158 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, |
2036 | struct btrfs_bio *bbio, u64 *raid_map, | 2159 | struct btrfs_bio *bbio, u64 *raid_map, |
2037 | u64 stripe_len, int mirror_num) | 2160 | u64 stripe_len, int mirror_num, int generic_io) |
2038 | { | 2161 | { |
2039 | struct btrfs_raid_bio *rbio; | 2162 | struct btrfs_raid_bio *rbio; |
2040 | int ret; | 2163 | int ret; |
2041 | 2164 | ||
2042 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | 2165 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); |
2043 | if (IS_ERR(rbio)) | 2166 | if (IS_ERR(rbio)) { |
2167 | __free_bbio_and_raid_map(bbio, raid_map, generic_io); | ||
2044 | return PTR_ERR(rbio); | 2168 | return PTR_ERR(rbio); |
2169 | } | ||
2045 | 2170 | ||
2046 | rbio->read_rebuild = 1; | 2171 | rbio->operation = BTRFS_RBIO_READ_REBUILD; |
2047 | bio_list_add(&rbio->bio_list, bio); | 2172 | bio_list_add(&rbio->bio_list, bio); |
2048 | rbio->bio_list_bytes = bio->bi_iter.bi_size; | 2173 | rbio->bio_list_bytes = bio->bi_iter.bi_size; |
2049 | 2174 | ||
2050 | rbio->faila = find_logical_bio_stripe(rbio, bio); | 2175 | rbio->faila = find_logical_bio_stripe(rbio, bio); |
2051 | if (rbio->faila == -1) { | 2176 | if (rbio->faila == -1) { |
2052 | BUG(); | 2177 | BUG(); |
2053 | kfree(raid_map); | 2178 | __free_bbio_and_raid_map(bbio, raid_map, generic_io); |
2054 | kfree(bbio); | ||
2055 | kfree(rbio); | 2179 | kfree(rbio); |
2056 | return -EIO; | 2180 | return -EIO; |
2057 | } | 2181 | } |
2058 | 2182 | ||
2183 | if (generic_io) { | ||
2184 | btrfs_bio_counter_inc_noblocked(root->fs_info); | ||
2185 | rbio->generic_bio_cnt = 1; | ||
2186 | } else { | ||
2187 | set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); | ||
2188 | } | ||
2189 | |||
2059 | /* | 2190 | /* |
2060 | * reconstruct from the q stripe if they are | 2191 | * reconstruct from the q stripe if they are |
2061 | * asking for mirror 3 | 2192 | * asking for mirror 3 |
2062 | */ | 2193 | */ |
2063 | if (mirror_num == 3) | 2194 | if (mirror_num == 3) |
2064 | rbio->failb = bbio->num_stripes - 2; | 2195 | rbio->failb = rbio->real_stripes - 2; |
2065 | 2196 | ||
2066 | ret = lock_stripe_add(rbio); | 2197 | ret = lock_stripe_add(rbio); |
2067 | 2198 | ||
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work) | |||
2098 | rbio = container_of(work, struct btrfs_raid_bio, work); | 2229 | rbio = container_of(work, struct btrfs_raid_bio, work); |
2099 | __raid56_parity_recover(rbio); | 2230 | __raid56_parity_recover(rbio); |
2100 | } | 2231 | } |
2232 | |||
2233 | /* | ||
2234 | * The following code is used to scrub/replace the parity stripe | ||
2235 | * | ||
2236 | * Note: We must make sure all the pages added to the scrub/replace | ||
2237 | * raid bio are correct and are not changed during the scrub/replace. | ||
2238 | * That is, those pages hold only metadata or file data with checksums. | ||
2239 | */ | ||
2240 | |||
2241 | struct btrfs_raid_bio * | ||
2242 | raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, | ||
2243 | struct btrfs_bio *bbio, u64 *raid_map, | ||
2244 | u64 stripe_len, struct btrfs_device *scrub_dev, | ||
2245 | unsigned long *dbitmap, int stripe_nsectors) | ||
2246 | { | ||
2247 | struct btrfs_raid_bio *rbio; | ||
2248 | int i; | ||
2249 | |||
2250 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
2251 | if (IS_ERR(rbio)) | ||
2252 | return NULL; | ||
2253 | bio_list_add(&rbio->bio_list, bio); | ||
2254 | /* | ||
2255 | * This is a special bio which is used to hold the completion handler | ||
2256 | * and make the scrub rbio behave like the other rbio types. | ||
2257 | */ | ||
2258 | ASSERT(!bio->bi_iter.bi_size); | ||
2259 | rbio->operation = BTRFS_RBIO_PARITY_SCRUB; | ||
2260 | |||
2261 | for (i = 0; i < rbio->real_stripes; i++) { | ||
2262 | if (bbio->stripes[i].dev == scrub_dev) { | ||
2263 | rbio->scrubp = i; | ||
2264 | break; | ||
2265 | } | ||
2266 | } | ||
2267 | |||
2268 | /* For now we only support the case where sectorsize equals page size */ | ||
2269 | ASSERT(root->sectorsize == PAGE_SIZE); | ||
2270 | ASSERT(rbio->stripe_npages == stripe_nsectors); | ||
2271 | bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); | ||
2272 | |||
2273 | return rbio; | ||
2274 | } | ||
2275 | |||
2276 | void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, | ||
2277 | struct page *page, u64 logical) | ||
2278 | { | ||
2279 | int stripe_offset; | ||
2280 | int index; | ||
2281 | |||
2282 | ASSERT(logical >= rbio->raid_map[0]); | ||
2283 | ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + | ||
2284 | rbio->stripe_len * rbio->nr_data); | ||
2285 | stripe_offset = (int)(logical - rbio->raid_map[0]); | ||
2286 | index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
2287 | rbio->bio_pages[index] = page; | ||
2288 | } | ||
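raid56_parity_add_scrub_pages() places a page into bio_pages[] purely from its logical address: the byte offset from the start of the full stripe (raid_map[0]) shifted down by the page size. Because the data stripes are laid out back to back in logical address space, the single shift selects both the stripe and the page within it. A worked example, assuming 4 KiB pages and 64 KiB stripes:

#include <stdio.h>

#define PAGE_SHIFT 12                /* 4 KiB pages */

int main(void)
{
	unsigned long long raid_map0 = 1048576;   /* logical start of stripe */
	unsigned long long stripe_len = 65536;    /* 64 KiB -> 16 pages */
	/* a page 5 pages into the second data stripe */
	unsigned long long logical = raid_map0 + stripe_len + 5 * 4096;

	int index = (int)((logical - raid_map0) >> PAGE_SHIFT);

	printf("bio_pages index = %d (stripe %llu, page %d)\n",
	       index, (logical - raid_map0) / stripe_len, index % 16);
	/* prints: bio_pages index = 21 (stripe 1, page 5) */
	return 0;
}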
2289 | |||
2290 | /* | ||
2291 | * We only scrub the parity for the horizontal stripes where we have | ||
2292 | * correct data, so we needn't allocate pages for all the stripes. | ||
2293 | */ | ||
2294 | static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) | ||
2295 | { | ||
2296 | int i; | ||
2297 | int bit; | ||
2298 | int index; | ||
2299 | struct page *page; | ||
2300 | |||
2301 | for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { | ||
2302 | for (i = 0; i < rbio->real_stripes; i++) { | ||
2303 | index = i * rbio->stripe_npages + bit; | ||
2304 | if (rbio->stripe_pages[index]) | ||
2305 | continue; | ||
2306 | |||
2307 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2308 | if (!page) | ||
2309 | return -ENOMEM; | ||
2310 | rbio->stripe_pages[index] = page; | ||
2311 | ClearPageUptodate(page); | ||
2312 | } | ||
2313 | } | ||
2314 | return 0; | ||
2315 | } | ||
2316 | |||
2317 | /* | ||
2318 | * end io function used by finish_parity_scrub. When we finally | ||
2319 | * get here, we've written back the repaired parity pages | ||
2320 | */ | ||
2321 | static void raid_write_parity_end_io(struct bio *bio, int err) | ||
2322 | { | ||
2323 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
2324 | |||
2325 | if (err) | ||
2326 | fail_bio_stripe(rbio, bio); | ||
2327 | |||
2328 | bio_put(bio); | ||
2329 | |||
2330 | if (!atomic_dec_and_test(&rbio->stripes_pending)) | ||
2331 | return; | ||
2332 | |||
2333 | err = 0; | ||
2334 | |||
2335 | if (atomic_read(&rbio->error)) | ||
2336 | err = -EIO; | ||
2337 | |||
2338 | rbio_orig_end_io(rbio, err, 0); | ||
2339 | } | ||
2340 | |||
2341 | static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, | ||
2342 | int need_check) | ||
2343 | { | ||
2344 | struct btrfs_bio *bbio = rbio->bbio; | ||
2345 | void *pointers[rbio->real_stripes]; | ||
2346 | DECLARE_BITMAP(pbitmap, rbio->stripe_npages); | ||
2347 | int nr_data = rbio->nr_data; | ||
2348 | int stripe; | ||
2349 | int pagenr; | ||
2350 | int p_stripe = -1; | ||
2351 | int q_stripe = -1; | ||
2352 | struct page *p_page = NULL; | ||
2353 | struct page *q_page = NULL; | ||
2354 | struct bio_list bio_list; | ||
2355 | struct bio *bio; | ||
2356 | int is_replace = 0; | ||
2357 | int ret; | ||
2358 | |||
2359 | bio_list_init(&bio_list); | ||
2360 | |||
2361 | if (rbio->real_stripes - rbio->nr_data == 1) { | ||
2362 | p_stripe = rbio->real_stripes - 1; | ||
2363 | } else if (rbio->real_stripes - rbio->nr_data == 2) { | ||
2364 | p_stripe = rbio->real_stripes - 2; | ||
2365 | q_stripe = rbio->real_stripes - 1; | ||
2366 | } else { | ||
2367 | BUG(); | ||
2368 | } | ||
2369 | |||
2370 | if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { | ||
2371 | is_replace = 1; | ||
2372 | bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); | ||
2373 | } | ||
2374 | |||
2375 | /* | ||
2376 | * The higher layers (the scrubber) are unlikely to use | ||
2377 | * this area of the disk again soon, so don't cache | ||
2378 | * it. | ||
2379 | */ | ||
2380 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
2381 | |||
2382 | if (!need_check) | ||
2383 | goto writeback; | ||
2384 | |||
2385 | p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2386 | if (!p_page) | ||
2387 | goto cleanup; | ||
2388 | SetPageUptodate(p_page); | ||
2389 | |||
2390 | if (q_stripe != -1) { | ||
2391 | q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2392 | if (!q_page) { | ||
2393 | __free_page(p_page); | ||
2394 | goto cleanup; | ||
2395 | } | ||
2396 | SetPageUptodate(q_page); | ||
2397 | } | ||
2398 | |||
2399 | atomic_set(&rbio->error, 0); | ||
2400 | |||
2401 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2402 | struct page *p; | ||
2403 | void *parity; | ||
2404 | /* first collect one page from each data stripe */ | ||
2405 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
2406 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
2407 | pointers[stripe] = kmap(p); | ||
2408 | } | ||
2409 | |||
2410 | /* then add the parity stripe */ | ||
2411 | pointers[stripe++] = kmap(p_page); | ||
2412 | |||
2413 | if (q_stripe != -1) { | ||
2414 | |||
2415 | /* | ||
2416 | * raid6, add the qstripe and call the | ||
2417 | * library function to fill in our p/q | ||
2418 | */ | ||
2419 | pointers[stripe++] = kmap(q_page); | ||
2420 | |||
2421 | raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, | ||
2422 | pointers); | ||
2423 | } else { | ||
2424 | /* raid5 */ | ||
2425 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
2426 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
2427 | } | ||
2428 | |||
2429 | /* Check the scrubbed parity and repair it if it's wrong */ | ||
2430 | p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2431 | parity = kmap(p); | ||
2432 | if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) | ||
2433 | memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); | ||
2434 | else | ||
2435 | /* Parity is right, no need to write it back */ | ||
2436 | bitmap_clear(rbio->dbitmap, pagenr, 1); | ||
2437 | kunmap(p); | ||
2438 | |||
2439 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) | ||
2440 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
2441 | } | ||
2442 | |||
2443 | __free_page(p_page); | ||
2444 | if (q_page) | ||
2445 | __free_page(q_page); | ||
2446 | |||
2447 | writeback: | ||
2448 | /* | ||
2449 | * time to start writing. Make bios only for the parity | ||
2450 | * pages we repaired on the scrub device. Ignore | ||
2451 | * everything else. | ||
2452 | */ | ||
2453 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2454 | struct page *page; | ||
2455 | |||
2456 | page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2457 | ret = rbio_add_io_page(rbio, &bio_list, | ||
2458 | page, rbio->scrubp, pagenr, rbio->stripe_len); | ||
2459 | if (ret) | ||
2460 | goto cleanup; | ||
2461 | } | ||
2462 | |||
2463 | if (!is_replace) | ||
2464 | goto submit_write; | ||
2465 | |||
2466 | for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { | ||
2467 | struct page *page; | ||
2468 | |||
2469 | page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2470 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
2471 | bbio->tgtdev_map[rbio->scrubp], | ||
2472 | pagenr, rbio->stripe_len); | ||
2473 | if (ret) | ||
2474 | goto cleanup; | ||
2475 | } | ||
2476 | |||
2477 | submit_write: | ||
2478 | nr_data = bio_list_size(&bio_list); | ||
2479 | if (!nr_data) { | ||
2480 | /* All the parity was right */ | ||
2481 | rbio_orig_end_io(rbio, 0, 0); | ||
2482 | return; | ||
2483 | } | ||
2484 | |||
2485 | atomic_set(&rbio->stripes_pending, nr_data); | ||
2486 | |||
2487 | while (1) { | ||
2488 | bio = bio_list_pop(&bio_list); | ||
2489 | if (!bio) | ||
2490 | break; | ||
2491 | |||
2492 | bio->bi_private = rbio; | ||
2493 | bio->bi_end_io = raid_write_parity_end_io; | ||
2494 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
2495 | submit_bio(WRITE, bio); | ||
2496 | } | ||
2497 | return; | ||
2498 | |||
2499 | cleanup: | ||
2500 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2501 | } | ||
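The heart of finish_parity_scrub() is the check loop: recompute parity from the data pages, memcmp against the parity read from disk, and either clear that page's dbitmap bit (parity was fine, nothing to write) or overwrite the page and leave the bit set so the writeback loop rewrites exactly the repaired pages. A self-contained RAID5 (XOR) version of that check, assuming 4 KiB pages:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Recompute XOR parity over nr_data buffers into 'calc'. */
static void xor_parity(unsigned char *calc,
		       unsigned char (*data)[PAGE_SIZE], int nr_data)
{
	memcpy(calc, data[0], PAGE_SIZE);
	for (int i = 1; i < nr_data; i++)
		for (int b = 0; b < PAGE_SIZE; b++)
			calc[b] ^= data[i][b];
}

/* Returns 1 if the on-disk parity had to be repaired. */
static int scrub_parity_page(unsigned char (*data)[PAGE_SIZE], int nr_data,
			     unsigned char *parity)
{
	unsigned char calc[PAGE_SIZE];

	xor_parity(calc, data, nr_data);
	if (memcmp(parity, calc, PAGE_SIZE) == 0)
		return 0;                /* parity right: skip writeback */
	memcpy(parity, calc, PAGE_SIZE); /* repair, caller writes it back */
	return 1;
}

int main(void)
{
	static unsigned char data[2][PAGE_SIZE], parity[PAGE_SIZE];

	data[0][7] = 0xaa;
	data[1][7] = 0x55;
	parity[7] = 0xff;               /* correct: 0xaa ^ 0x55 */
	printf("repaired: %d\n", scrub_parity_page(data, 2, parity)); /* 0 */
	parity[9] = 0x01;               /* corrupt a byte */
	printf("repaired: %d\n", scrub_parity_page(data, 2, parity)); /* 1 */
	return 0;
}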
2502 | |||
2503 | static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) | ||
2504 | { | ||
2505 | if (stripe >= 0 && stripe < rbio->nr_data) | ||
2506 | return 1; | ||
2507 | return 0; | ||
2508 | } | ||
2509 | |||
2510 | /* | ||
2511 | * While we're doing the parity check and repair, we could have errors | ||
2512 | * in reading pages off the disk. This checks for errors and if we're | ||
2513 | * not able to read the page it'll trigger parity reconstruction. The | ||
2514 | * parity scrub will be finished after we've reconstructed the failed | ||
2515 | * stripes | ||
2516 | */ | ||
2517 | static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) | ||
2518 | { | ||
2519 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) | ||
2520 | goto cleanup; | ||
2521 | |||
2522 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
2523 | int dfail = 0, failp = -1; | ||
2524 | |||
2525 | if (is_data_stripe(rbio, rbio->faila)) | ||
2526 | dfail++; | ||
2527 | else if (is_parity_stripe(rbio->faila)) | ||
2528 | failp = rbio->faila; | ||
2529 | |||
2530 | if (is_data_stripe(rbio, rbio->failb)) | ||
2531 | dfail++; | ||
2532 | else if (is_parity_stripe(rbio->failb)) | ||
2533 | failp = rbio->failb; | ||
2534 | |||
2535 | /* | ||
2536 | * Because we cannot use the parity being scrubbed to | ||
2537 | * repair data, our repair capability is reduced by one. | ||
2538 | * (In the RAID5 case, we cannot repair anything.) | ||
2539 | */ | ||
2540 | if (dfail > rbio->bbio->max_errors - 1) | ||
2541 | goto cleanup; | ||
2542 | |||
2543 | /* | ||
2544 | * If all the data is good, then only the parity is bad; | ||
2545 | * just repair the parity. | ||
2546 | */ | ||
2547 | if (dfail == 0) { | ||
2548 | finish_parity_scrub(rbio, 0); | ||
2549 | return; | ||
2550 | } | ||
2551 | |||
2552 | /* | ||
2553 | * Getting here means we have one corrupted data stripe and | ||
2554 | * one corrupted parity on RAID6. If the corrupted parity | ||
2555 | * is the one being scrubbed, we can luckily use the other | ||
2556 | * parity to repair the data; otherwise we cannot repair it. | ||
2557 | */ | ||
2558 | if (failp != rbio->scrubp) | ||
2559 | goto cleanup; | ||
2560 | |||
2561 | __raid_recover_end_io(rbio); | ||
2562 | } else { | ||
2563 | finish_parity_scrub(rbio, 1); | ||
2564 | } | ||
2565 | return; | ||
2566 | |||
2567 | cleanup: | ||
2568 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2569 | } | ||
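The failure analysis above counts failed data stripes (dfail) and records a failed parity stripe (failp). Since the parity being scrubbed cannot be trusted to repair data, the effective redundancy for data is max_errors - 1, and a failed parity only blocks recovery when it is not the scrub target itself. A condensed decision function mirroring that flow (a sketch, not the kernel code):

enum scrub_action { SCRUB_FAIL, SCRUB_FINISH, SCRUB_REBUILD_FIRST };

static enum scrub_action scrub_decide(int dfail, int failp, int scrubp,
				      int max_errors)
{
	/* the parity under scrub can't be used to repair data */
	if (dfail > max_errors - 1)
		return SCRUB_FAIL;          /* too many bad data stripes */
	if (dfail == 0)
		return SCRUB_FINISH;        /* only parity is bad: rewrite it */
	if (failp != scrubp)
		return SCRUB_FAIL;          /* the surviving parity is bad */
	return SCRUB_REBUILD_FIRST;         /* rebuild data, then scrub */
}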
2570 | |||
2571 | /* | ||
2572 | * end io for the read phase of the scrub cycle. All the bios here are | ||
2573 | * physical stripe bios we've read from the disk so we can recalculate | ||
2574 | * the parity of the stripe. | ||
2575 | * | ||
2576 | * This will usually kick off finish_parity_scrub once all the bios | ||
2577 | * are read in, but it may trigger reconstruction on any read errors. | ||
2578 | */ | ||
2579 | static void raid56_parity_scrub_end_io(struct bio *bio, int err) | ||
2580 | { | ||
2581 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
2582 | |||
2583 | if (err) | ||
2584 | fail_bio_stripe(rbio, bio); | ||
2585 | else | ||
2586 | set_bio_pages_uptodate(bio); | ||
2587 | |||
2588 | bio_put(bio); | ||
2589 | |||
2590 | if (!atomic_dec_and_test(&rbio->stripes_pending)) | ||
2591 | return; | ||
2592 | |||
2593 | /* | ||
2594 | * this will normally call finish_parity_scrub to start our write | ||
2595 | * but if there are any failed stripes we'll reconstruct | ||
2596 | * from parity first | ||
2597 | */ | ||
2598 | validate_rbio_for_parity_scrub(rbio); | ||
2599 | } | ||
2600 | |||
2601 | static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) | ||
2602 | { | ||
2603 | int bios_to_read = 0; | ||
2604 | struct bio_list bio_list; | ||
2605 | int ret; | ||
2606 | int pagenr; | ||
2607 | int stripe; | ||
2608 | struct bio *bio; | ||
2609 | |||
2610 | ret = alloc_rbio_essential_pages(rbio); | ||
2611 | if (ret) | ||
2612 | goto cleanup; | ||
2613 | |||
2614 | bio_list_init(&bio_list); | ||
2615 | |||
2616 | atomic_set(&rbio->error, 0); | ||
2617 | /* | ||
2618 | * build a list of bios to read all the missing parts of this | ||
2619 | * stripe | ||
2620 | */ | ||
2621 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { | ||
2622 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2623 | struct page *page; | ||
2624 | /* | ||
2625 | * we want to find all the pages missing from | ||
2626 | * the rbio and read them from the disk. If | ||
2627 | * page_in_rbio finds a page in the bio list | ||
2628 | * we don't need to read it off the stripe. | ||
2629 | */ | ||
2630 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
2631 | if (page) | ||
2632 | continue; | ||
2633 | |||
2634 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
2635 | /* | ||
2636 | * the bio cache may have handed us an uptodate | ||
2637 | * page. If so, be happy and use it | ||
2638 | */ | ||
2639 | if (PageUptodate(page)) | ||
2640 | continue; | ||
2641 | |||
2642 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
2643 | stripe, pagenr, rbio->stripe_len); | ||
2644 | if (ret) | ||
2645 | goto cleanup; | ||
2646 | } | ||
2647 | } | ||
2648 | |||
2649 | bios_to_read = bio_list_size(&bio_list); | ||
2650 | if (!bios_to_read) { | ||
2651 | /* | ||
2652 | * this can happen if others have merged with | ||
2653 | * us; it means there is nothing left to read. | ||
2654 | * But if there are missing devices it may not be | ||
2655 | * safe to do the full stripe write yet. | ||
2656 | */ | ||
2657 | goto finish; | ||
2658 | } | ||
2659 | |||
2660 | /* | ||
2661 | * the bbio may be freed once we submit the last bio. Make sure | ||
2662 | * not to touch it after that | ||
2663 | */ | ||
2664 | atomic_set(&rbio->stripes_pending, bios_to_read); | ||
2665 | while (1) { | ||
2666 | bio = bio_list_pop(&bio_list); | ||
2667 | if (!bio) | ||
2668 | break; | ||
2669 | |||
2670 | bio->bi_private = rbio; | ||
2671 | bio->bi_end_io = raid56_parity_scrub_end_io; | ||
2672 | |||
2673 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
2674 | BTRFS_WQ_ENDIO_RAID56); | ||
2675 | |||
2676 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
2677 | submit_bio(READ, bio); | ||
2678 | } | ||
2679 | /* the actual write will happen once the reads are done */ | ||
2680 | return; | ||
2681 | |||
2682 | cleanup: | ||
2683 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2684 | return; | ||
2685 | |||
2686 | finish: | ||
2687 | validate_rbio_for_parity_scrub(rbio); | ||
2688 | } | ||
2689 | |||
2690 | static void scrub_parity_work(struct btrfs_work *work) | ||
2691 | { | ||
2692 | struct btrfs_raid_bio *rbio; | ||
2693 | |||
2694 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2695 | raid56_parity_scrub_stripe(rbio); | ||
2696 | } | ||
2697 | |||
2698 | static void async_scrub_parity(struct btrfs_raid_bio *rbio) | ||
2699 | { | ||
2700 | btrfs_init_work(&rbio->work, btrfs_rmw_helper, | ||
2701 | scrub_parity_work, NULL, NULL); | ||
2702 | |||
2703 | btrfs_queue_work(rbio->fs_info->rmw_workers, | ||
2704 | &rbio->work); | ||
2705 | } | ||
2706 | |||
2707 | void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) | ||
2708 | { | ||
2709 | if (!lock_stripe_add(rbio)) | ||
2710 | async_scrub_parity(rbio); | ||
2711 | } | ||